Commit 97e8278b authored by zzg_666

Adapt backend to vLLM

from dataflow.operators.code import (
CodeCodeToInstructionGenerator,
CodeInstructionToCodeGenerator,
CodeQualitySampleEvaluator,
CodeQualityScoreFilter,
CodeSandboxSampleEvaluator,
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
class CodeSFTSynthesis_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/CodePipeline/code_synthesis_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
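# NOTE: step 1 below reads the "raw_code" field from each input record; a hypothetical
# line of code_synthesis_input.jsonl (the snippet value is purely illustrative) could be:
# {"raw_code": "def add(a, b):\n    return a + b"}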
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# Step 1: Code to Instruction synthesizer
self.instruction_synthesizer_step1 = CodeCodeToInstructionGenerator(
llm_serving=self.llm_serving
)
# Step 2: Instruction to Code generator
self.code_generator_step2 = CodeInstructionToCodeGenerator(
llm_serving=self.llm_serving
)
# Step 3: Quality evaluator for (instruction, code) pairs
self.pair_evaluator_step3 = CodeQualitySampleEvaluator(
llm_serving=self.llm_serving
)
# Step 4: Score-based filter
self.score_filter_step4 = CodeQualityScoreFilter(
llm_serving=self.llm_serving,
min_score=0.0,
max_score=10.0
)
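# NOTE: with min_score=0.0 and max_score=10.0 this filter keeps any sample whose score
# falls in the full 0-10 range; raise min_score (e.g. 7.0, as CodeGenDataset_APIPipeline
# below does) to filter more aggressively.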
# Step 5: Sandbox evaluator
self.sandbox_evaluator_step5 = CodeSandboxSampleEvaluator(
language='python'
)
def forward(self):
# Step 1: Generate instructions from raw code
self.instruction_synthesizer_step1.run(
storage=self.storage.step(),
input_key="raw_code",
output_key="generated_instruction"
)
# Step 2: Generate code from instructions
self.code_generator_step2.run(
storage=self.storage.step(),
input_key="generated_instruction",
output_key="generated_code"
)
# Step 3: Evaluate the generated (instruction, code) pairs
self.pair_evaluator_step3.run(
storage=self.storage.step(),
input_instruction_key="generated_instruction",
input_code_key="generated_code"
)
# Step 4: Filter out low-quality samples
self.score_filter_step4.run(
storage=self.storage.step(),
input_instruction_key = "generated_instruction",
input_code_key = "generated_code",
output_key="quality_score_filter_label"
)
# Step 5: Evaluate high-quality code in sandbox
self.sandbox_evaluator_step5.run(
storage=self.storage.step(),
input_key="generated_code"
)
if __name__ == "__main__":
model = CodeSFTSynthesis_APIPipeline()
model.forward()
\ No newline at end of file
from dataflow.operators.code import (
CodeCodeToInstructionGenerator,
CodeInstructionToCodeGenerator,
CodeQualitySampleEvaluator,
CodeQualityScoreFilter,
CodeSandboxSampleEvaluator,
CodeEnhancementInstructionGenerator,
CodeInstructionGenerator
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
class CodeGenDataset_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/CodePipeline/raw_code.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10
)
# Step 1: Instruction Enhancement
self.instruction_generator_step1 = CodeEnhancementInstructionGenerator(
llm_serving=self.llm_serving
)
# Step 2: Instruction to Code generator
self.code_generator_step2 = CodeInstructionToCodeGenerator(
llm_serving=self.llm_serving
)
# Step 3: Quality evaluator for (instruction, code) pairs
self.pair_evaluator_step3 = CodeQualitySampleEvaluator(
llm_serving=self.llm_serving
)
# Step 4: Score-based filter
self.score_filter_step4 = CodeQualityScoreFilter(
llm_serving=self.llm_serving,
min_score=7.0,
max_score=10.0
)
# Step 5: Sandbox evaluator
self.sandbox_evaluator_step5 = CodeSandboxSampleEvaluator(
language='python'
)
def forward(self):
# Step 1: Generate instructions from raw data
self.instruction_generator_step1.run(
storage=self.storage.step(),
input_key="messages",
output_key="generated_instruction",
)
# Step 2: Generate code from instructions
self.code_generator_step2.run(
storage=self.storage.step(),
input_key="generated_instruction",
output_key="generated_code"
)
# Step 3: Evaluate the generated (instruction, code) pairs
self.pair_evaluator_step3.run(
storage=self.storage.step(),
input_instruction_key="generated_instruction",
input_code_key="generated_code"
)
# Step 4: Filter out low-quality samples
self.score_filter_step4.run(
storage=self.storage.step(),
input_instruction_key = "generated_instruction",
input_code_key = "generated_code",
output_score_key = "quality_score",
output_feedback_key = "quality_feedback",
output_key="quality_score_filter_label"
)
# Step 5: Evaluate high-quality code in sandbox
self.sandbox_evaluator_step5.run(
storage=self.storage.step(),
input_key="generated_code"
)
if __name__ == "__main__":
model = CodeGenDataset_APIPipeline()
model.forward()
\ No newline at end of file
from dataflow.operators.core_text import BenchDatasetEvaluator
from dataflow.operators.reasoning import ReasoningAnswerGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyAnswerGeneratorPrompt,
)
DIY_PROMPT_ANSWER = """Please output the answer."""
class BenchEvalPipeline():
def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/bench_eval_data_2.jsonl",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
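# NOTE: the evaluator below reads the "model_answer" and "golden_label" keys, so a
# hypothetical line of bench_eval_data_2.jsonl (values are illustrative) could be:
# {"model_answer": "42", "golden_label": "42"}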
# use API server as LLM serving
self.llm_serving_judger = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.evaluator_step = BenchDatasetEvaluator(
eval_result_path="./cache_local/eval_result/eval_result.jsonl",
compare_method="semantic", # "semantic" or "match"
llm_serving=self.llm_serving,
prompt_template = None # you can customize your judge prompt via dataflow.prompts.reasoning.general.AnswerJudgePrompt
)
def forward(self):
self.evaluator_step.run(
storage = self.storage.step(),
input_test_answer_key="model_answer",
input_gt_answer_key="golden_label"
)
if __name__ == "__main__":
pl = BenchEvalPipeline()
pl.forward()
from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
from dataflow.operators.reasoning import ReasoningAnswerGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyAnswerGeneratorPrompt,
)
DIY_PROMPT_ANSWER = """Please output the answer."""
class BenchEvalPipeline():
def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/bench_eval_data.jsonl",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving_generator = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=2048,
)
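# NOTE: vllm_tensor_parallel_size=1 serves the generator on a single GPU; setting it to
# the number of available GPUs shards the model across them via vLLM tensor parallelism.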
# use API server as LLM serving
self.llm_serving_judger = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.answer_generator_step1 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving_generator,
prompt_template=DiyAnswerGeneratorPrompt(DIY_PROMPT_ANSWER)
)
self.evaluator_step2 = BenchDatasetEvaluatorQuestion(
eval_result_path="./cache_local/eval_result/eval_result.jsonl",
compare_method="semantic", # "semantic" or "match"
llm_serving=self.llm_serving,
prompt_template = None # you can customize your judge prompt via dataflow.prompts.reasoning.general.AnswerJudgePrompt
)
def forward(self):
self.answer_generator_step1.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.evaluator_step2.run(
storage = self.storage.step(),
input_test_answer_key="generated_cot",
input_gt_answer_key="golden_answer",
input_question_key="instruction",
)
if __name__ == "__main__":
pl = BenchEvalPipeline()
pl.forward()
from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
from dataflow.operators.reasoning import ReasoningAnswerGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyAnswerGeneratorPrompt,
)
DIY_PROMPT_ANSWER = """Please output the answer."""
class BenchEvalPipeline():
def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/bench_eval_data_2.jsonl",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving_judger = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.evaluator_step = BenchDatasetEvaluatorQuestion(
eval_result_path="./cache_local/eval_result/eval_result.jsonl",
compare_method="semantic", # "semantic" or "match"
llm_serving=self.llm_serving,
prompt_template = None # you can customize your judge prompt via dataflow.prompts.reasoning.general.AnswerJudgePrompt
)
def forward(self):
self.evaluator_step.run(
storage = self.storage.step(),
input_test_answer_key="model_answer",
input_gt_answer_key="golden_label",
input_question_key="question",
)
if __name__ == "__main__":
pl = BenchEvalPipeline()
pl.forward()
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.operators.conversations import (
ScenarioExtractGenerator,
ScenarioExpandGenerator,
AtomTaskGenerator,
SequentialTaskGenerator,
ParaSeqTaskGenerator,
CompositionTaskFilter,
FunctionGenerator,
MultiTurnConversationGenerator,
FuncCallConversationSampleEvaluator
)
class FuncCall_APIPipeline:
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/FuncCallPipeline/chat_data.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="http://123.129.219.111:3000/v1/chat/completions",
model_name="gpt-4o",
max_workers=128
)
self.scenario_extractor = ScenarioExtractGenerator(llm_serving=self.llm_serving)
self.scenario_expander = ScenarioExpandGenerator(llm_serving=self.llm_serving)
self.atom_task_generator = AtomTaskGenerator(llm_serving=self.llm_serving)
self.sequential_task_generator = SequentialTaskGenerator(llm_serving=self.llm_serving)
self.parallel_sequential_task_generator = ParaSeqTaskGenerator(llm_serving=self.llm_serving)
self.composition_task_filter = CompositionTaskFilter(llm_serving=self.llm_serving)
self.function_generator = FunctionGenerator(llm_serving=self.llm_serving)
self.multi_turn_conversations_generator = MultiTurnConversationGenerator(llm_serving=self.llm_serving)
self.evaluator = FuncCallConversationSampleEvaluator(llm_serving=self.llm_serving)
def forward(self):
self.scenario_extractor.run(
self.storage.step(),
input_chat_key="chat"
)
self.scenario_expander.run(
self.storage.step(),
input_scenario_key="scenario"
)
self.atom_task_generator.run(
self.storage.step(),
input_scenario_key="scenario"
)
# self.atom_task_generator.run(
# self.storage.step(),
# input_scenario_key="modified_scenario",
# output_key='subsequent_task'
# )
self.sequential_task_generator.run(
self.storage.step(),
input_task_key="atom_task"
)
# self.parallel_sequential_task_generator.run(
# self.storage.step(),
# input_task_key="atom_task"
# )
self.composition_task_filter.run(
self.storage.step(),
input_composition_task_key="composition_task",
input_sub_tasks_keys=["atom_task", "subsequent_task"]
)
self.function_generator.run(
self.storage.step(),
input_composition_task_key="composition_task",
input_sub_tasks_keys=["atom_task", "subsequent_task"]
)
self.multi_turn_conversations_generator.run(
self.storage.step(),
input_task_key="composition_task",
input_sub_tasks_keys=["atom_task", "subsequent_task"],
input_functions_key="functions",
)
self.evaluator.run(
self.storage.step(),
input_conversation_key='conversations'
)
if __name__ == "__main__":
pipeline = FuncCall_APIPipeline()
pipeline.forward()
\ No newline at end of file
from dataflow.operators.knowledge_cleaning import (
KBCChunkGenerator,
FileOrURLToMarkdownConverterBatch,
KBCTextCleaner,
# KBCMultiHopQAGenerator,
)
from dataflow.operators.core_text import Text2MultiHopQAGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
class KBCleaningPDF_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/KBCleaningPipeline/kbc_test_1.jsonl",
cache_path="./.cache/api",
file_name_prefix="knowledge_cleaning_step",
cache_type="json",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterBatch(
intermediate_dir="../example_data/KBCleaningPipeline/raw/",
lang="en",
mineru_backend="vlm-vllm-engine",
)
self.knowledge_cleaning_step2 = KBCChunkGenerator(
split_method="token",
chunk_size=512,
tokenizer_name="Qwen/Qwen2.5-7B-Instruct",
)
self.knowledge_cleaning_step3 = KBCTextCleaner(
llm_serving=self.llm_serving,
lang="en"
)
self.knowledge_cleaning_step4 = Text2MultiHopQAGenerator(
llm_serving=self.llm_serving,
lang="en",
num_q = 5
)
def forward(self):
self.knowledge_cleaning_step1.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
self.knowledge_cleaning_step2.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
self.knowledge_cleaning_step3.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
self.knowledge_cleaning_step4.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
if __name__ == "__main__":
model = KBCleaningPDF_APIPipeline()
model.forward()
\ No newline at end of file
import os
import sys
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.operators.pdf2vqa import VQAExtractor
class VQA_extract_optimized_pipeline:
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="./example_data/PDF2VQAPipeline/vqa_extract_test.jsonl",
cache_path="./cache",
file_name_prefix="vqa",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
key_name_of_api_key="DF_API_KEY",
model_name="gemini-2.5-pro",
max_workers=100,
)
self.vqa_extractor = VQAExtractor(
llm_serving=self.llm_serving,
mineru_backend='vlm-vllm-engine',
max_chunk_len=128000
)
def forward(self):
# Single operator: covers preprocessing, QA extraction, and post-processing
self.vqa_extractor.run(
storage=self.storage.step(),
input_question_pdf_path_key="question_pdf_path",
input_answer_pdf_path_key="answer_pdf_path",
input_pdf_path_key="pdf_path", # supports interleaved mode
input_subject_key="subject",
output_dir_key="output_dir",
output_jsonl_key="output_jsonl_path",
)
if __name__ == "__main__":
# Each line of the jsonl contains question_pdf_path, answer_pdf_path, subject (math, physics, chemistry, ...), and output_dir
# If the questions and answers are in the same PDF, set question_pdf_path and answer_pdf_path to the same path; the pipeline automatically switches to interleaved mode
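# A hypothetical input line (paths and subject are purely illustrative):
# {"question_pdf_path": "./papers/exam_questions.pdf", "answer_pdf_path": "./papers/exam_answers.pdf", "subject": "math", "output_dir": "./vqa_output"}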
pipeline = VQA_extract_optimized_pipeline()
pipeline.forward()
\ No newline at end of file
from dataflow.operators.reasoning import (
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
)
from dataflow.operators.reasoning import ReasoningQuestionFilter, ReasoningAnswerNgramFilter
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyQuestionFilterPrompt,
DiyAnswerGeneratorPrompt,
DiyQuestionSynthesisPrompt
)
"""
If 'prompt_template' is not None and 'content_type' is set to 'diy', make sure the input and output formats match those of the default prompt.
"""
DIY_PROMPT_QUESTION = """Please only keep the medical related data (judgement_test is true), for other data the judgement_test is false.
After these steps, output exactly:
{{
"judgement_test": true/false,
"error_type": "<error description or null>"
}}
You may include your chain of thought, but the final output must be the JSON above.
Here is the content to evaluate:
-------------------------------
{question}
-------------------------------
"""
DIY_PROMPT_SYNTHESIS = """
Please construct some new sports-related data from the source problem.
Here is the problem from the user:
{question}
Write another problem inspired by this one.
Not only change the problem scenario, but also try to create a new problem that requires another approach to solve.
Start directly with the problem statement and DO NOT include any phrases such as "Here is a new problem inspired by a given one".
After the problem is generated finish your response right away.
"""
DIY_PROMPT_ANSWER = """Please firstly output a symbol "Yeah, It is the answer:", and then output the answer."""
class DiyReasoning_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_general.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=DiyQuestionFilterPrompt(DIY_PROMPT_QUESTION)
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=1,
llm_serving=self.llm_serving,
prompt_template=DiyQuestionSynthesisPrompt(DIY_PROMPT_SYNTHESIS)
)
self.answer_generator_step3 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=DiyAnswerGeneratorPrompt(DIY_PROMPT_ANSWER)
)
self.answer_ngram_filter_step4 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.answer_generator_step3.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_ngram_filter_step4.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
if __name__ == "__main__":
pl = DiyReasoning_APIPipeline()
pl.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
)
from dataflow.operators.reasoning import ReasoningQuestionFilter, ReasoningAnswerNgramFilter, ReasoningAnswerModelJudgeFilter
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.general import (
GeneralQuestionFilterPrompt,
GeneralAnswerGeneratorPrompt,
GeneralQuestionSynthesisPrompt,
)
from dataflow.prompts.model_evaluation.general import AnswerJudgePrompt
class GeneralReasoning_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_general.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=GeneralQuestionFilterPrompt()
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=1,
llm_serving=self.llm_serving,
prompt_template=GeneralQuestionSynthesisPrompt()
)
self.answer_generator_step3 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=GeneralAnswerGeneratorPrompt()
)
self.answer_model_judge_step4 = ReasoningAnswerModelJudgeFilter(
llm_serving=self.llm_serving,
prompt_template=AnswerJudgePrompt(),
keep_all_samples=True
)
self.answer_ngram_filter_step5 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.answer_generator_step3.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_model_judge_step4.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot",
input_reference_key = "golden_answer"
)
self.answer_ngram_filter_step5.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
if __name__ == "__main__":
pl = GeneralReasoning_APIPipeline()
pl.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionCategorySampleEvaluator,
ReasoningQuestionDifficultySampleEvaluator,
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
)
from dataflow.operators.reasoning import (
ReasoningQuestionFilter,
ReasoningAnswerFormatterFilter,
ReasoningAnswerGroundTruthFilter,
ReasoningAnswerTokenLengthFilter,
ReasoningAnswerNgramFilter
)
from dataflow.prompts.reasoning.math import (
MathQuestionFilterPrompt,
MathAnswerGeneratorPrompt,
MathQuestionSynthesisPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
# Perhaps a shared pipeline base class could be added here in the future
class ReasoningMath_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_math_short.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# if llm_serving is None:
# # use local model as LLM serving
# llm_serving = LocalModelLLMServing(
# # model_name_or_path="/data0/models/Qwen2.5-7B-Instruct", # set to your own model path
# model_name_or_path="/mnt/public/model/huggingface/Qwen2.5-7B-Instruct",
# tensor_parallel_size=4,
# max_tokens=8192,
# model_source="local"
# )
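# A vLLM-backed local serving sketch, mirroring the LocalModelLLMServing_vllm usage shown
# in the BenchEvalPipeline elsewhere in this commit (model path and token limit are
# placeholders; also requires `from dataflow.serving import LocalModelLLMServing_vllm`):
# self.llm_serving = LocalModelLLMServing_vllm(
# hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path
# vllm_tensor_parallel_size=1,
# vllm_max_tokens=8192,
# )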
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating mathematical problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=MathQuestionFilterPrompt()
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=3,
llm_serving=self.llm_serving,
prompt_template=MathQuestionSynthesisPrompt()
)
self.question_filter_step3 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating mathematical problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=MathQuestionFilterPrompt()
)
self.question_difficulty_classifier_step4 = ReasoningQuestionDifficultySampleEvaluator(
llm_serving=self.llm_serving
)
self.question_category_classifier_step5 = ReasoningQuestionCategorySampleEvaluator(
llm_serving=self.llm_serving
)
########################## branch ############################
# self.answer_pipeline_root_step6 = AnswerPipelineRoot()
########################## answer ############################
self.answer_generator_step7 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=MathAnswerGeneratorPrompt()
)
self.answer_format_filter_step8 = ReasoningAnswerFormatterFilter()
self.answer_token_length_filter_step9 = ReasoningAnswerTokenLengthFilter(
max_answer_token_length = 8192,
tokenizer_dir = "Qwen/Qwen2.5-0.5B-Instruct"
)
self.answer_groundtruth_filter_step10 = ReasoningAnswerGroundTruthFilter()
self.answer_ngram_filter_step11 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
# In the future we could maintain an nn.Sequential-like container to make it easier to add and instantiate multiple operators
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_filter_step3.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_difficulty_classifier_step4.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "question_difficulty"
)
self.question_category_classifier_step5.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "question_category"
)
############# branch #############
# self.answer_pipeline_root_step6.run(
# storage = self.storage.step(),
# input_answer_key = "output",
# input_gt_key = "golden_answer"
# )
############## answer #############
self.answer_generator_step7.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_format_filter_step8.run(
storage = self.storage.step(),
input_key = "generated_cot",
)
self.answer_token_length_filter_step9.run(
storage = self.storage.step(),
input_key = "generated_cot"
)
self.answer_groundtruth_filter_step10.run(
storage = self.storage.step(),
input_test_answer_key = "generated_cot",
input_gt_answer_key = "golden_answer"
)
self.answer_ngram_filter_step11.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
if __name__ == "__main__":
model = ReasoningMath_APIPipeline()
model.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionFusionGenerator,
ReasoningQuestionSolvableSampleEvaluator,
)
from dataflow.operators.core_text import (
PandasOperator,
EmbeddingGenerator,
)
from dataflow.prompts.reasoning.math import (
MathQuestionParallelFusionGeneratorPrompt,
MathQuestionSequentialFusionGeneratorPrompt,
MathQuestionConditionFusionGeneratorPrompt,
MathQuestionEvaluatorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
import torch
import numpy as np
import pandas as pd
import re
# Perhaps a shared pipeline base class could be added here in the future
class ReasoningMath_APIPipeline_Mathfusion():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_math_short.json",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10
)
embedding_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=10
)
self.embedding_generator = EmbeddingGenerator(embedding_serving=embedding_serving)
def find_most_similar_questions(df):
df = df.dropna(subset=['embeddings']).reset_index(drop=True)
embeddings = torch.tensor(np.stack(df['embeddings'].values), dtype=torch.float32).cuda() # shape: (n, d); assumes a CUDA GPU is available
sim_matrix = torch.matmul(embeddings, embeddings.T) # shape: (n, n)
sim_matrix.fill_diagonal_(-float('inf'))
most_similar_idx = torch.argmax(sim_matrix, dim=1).cpu().numpy()
df['most_similar_problem'] = df['question'].iloc[most_similar_idx].values
return df
self.most_similar_matcher = PandasOperator([
find_most_similar_questions
])
self.extract_pair = PandasOperator([ # dropping embeddings to decrease file size
lambda df: df.drop(columns=[col for col in df.columns if "embeddings" in col])
])
self.sequential_fusion = ReasoningQuestionFusionGenerator(
num_prompts=1,
llm_serving=llm_serving,
prompt_template=MathQuestionSequentialFusionGeneratorPrompt()
)
self.parallel_fusion = ReasoningQuestionFusionGenerator(
num_prompts=1,
llm_serving=llm_serving,
prompt_template=MathQuestionParallelFusionGeneratorPrompt()
)
self.condition_fusion = ReasoningQuestionFusionGenerator(
num_prompts=2,
llm_serving=llm_serving,
prompt_template=MathQuestionConditionFusionGeneratorPrompt()
)
def combined(df: pd.DataFrame) -> pd.DataFrame:
"""
Combine all question-related columns into a single long-format DataFrame.
Automatically detects columns containing '_question_{i}' patterns.
"""
# Always keep the original 'question' column
question_cols = ["question"] if "question" in df.columns else []
# Match all columns like *_question_0, *_question_1, ...
pattern = re.compile(r".*_question_\d+$")
question_cols.extend([col for col in df.columns if pattern.match(col)])
if not question_cols:
raise ValueError("No question columns found matching pattern '_question_{i}'.")
# Convert to a long-format table
long_df = df.melt(value_vars=question_cols, value_name="questions")[["questions"]]
# Drop empty values and duplicates (the same question often appears more than once)
long_df = long_df.dropna(subset=["questions"]).drop_duplicates().reset_index(drop=True)
return long_df
self.combined_question = PandasOperator([combined])
self.question_evaluation = ReasoningQuestionSolvableSampleEvaluator(llm_serving=llm_serving, prompt_template=MathQuestionEvaluatorPrompt())
def extract_new_problem(df: pd.DataFrame) -> pd.DataFrame:
"""
Extract the content after '#New Problem#' from the 'questions' column
and store it in a new column 'new_problem'.
"""
if "questions" not in df.columns:
raise ValueError("Input DataFrame must contain a 'questions' column.")
def _extract(text: str) -> str:
if not isinstance(text, str):
return None
match = re.search(r"#New Problem#[:\s]*(.*)", text, re.DOTALL)
return match.group(1).strip() if match else None
df = df.copy()
df["refined_question"] = df["questions"].apply(_extract)
df = df.dropna(subset=["refined_question"]).reset_index(drop=True)
return df
self.extract_new_problem = PandasOperator([extract_new_problem])
def forward(self):
# self.first10.run(
# storage = self.storage.step(),
# )
self.embedding_generator.run(
storage = self.storage.step(),
input_key = "question",
output_key = "embeddings",
)
self.most_similar_matcher.run(
storage = self.storage.step(),
)
self.extract_pair.run(
storage = self.storage.step(),
)
self.sequential_fusion.run(
storage = self.storage.step(),
input_problem_1= "question",
input_problem_2= "most_similar_problem",
output_key="sequential_fusion",
)
self.parallel_fusion.run(
storage = self.storage.step(),
input_problem_1= "question",
input_problem_2= "most_similar_problem",
output_key="parallel_fusion"
)
self.condition_fusion.run(
storage = self.storage.step(),
input_problem_1= "question",
input_problem_2= "most_similar_problem",
output_key="condition_fusion"
)
self.combined_question.run(
storage = self.storage.step()
)
self.question_evaluation.run(
storage = self.storage.step(),
input_key = "questions",
output_key= "question_solvability"
)
self.extract_new_problem.run(
storage = self.storage.step()
)
if __name__ == "__main__":
pl = ReasoningMath_APIPipeline_Mathfusion()
pl.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
ReasoningPretrainFormatConvertGenerator
)
from dataflow.prompts.reasoning.math import (
MathQuestionFilterPrompt,
MathQuestionSynthesisPrompt,
MathAnswerGeneratorPrompt
)
from dataflow.operators.reasoning import ReasoningQuestionFilter, ReasoningAnswerNgramFilter, ReasoningAnswerPipelineRootFilter
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
# Perhaps a shared pipeline base class could be added here in the future
class Reasoning_APIPipeline_Pretrain():
def __init__(self, llm_serving=None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_math_short.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
if llm_serving is None:
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
## use local model as LLM serving
# llm_serving = LocalModelLLMServing(
# # model_name_or_path="/data0/models/Qwen2.5-7B-Instruct", # set to your own model path
# model_name_or_path="/mnt/public/model/huggingface/Qwen2.5-7B-Instruct",
# tensor_parallel_size=4,
# max_tokens=1024,
# model_source="local"
# )
else:
self.llm_serving = llm_serving
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating mathematical problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=MathQuestionFilterPrompt()
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=3,
llm_serving=self.llm_serving,
prompt_template=MathQuestionSynthesisPrompt()
)
########################## branch ############################
self.answer_pipeline_root_step3 = ReasoningAnswerPipelineRootFilter()
########################## answer ############################
self.answer_generator_step4 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=MathAnswerGeneratorPrompt()
)
self.answer_ngram_filter_step5 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
self.sft_to_pretrain_step6 = ReasoningPretrainFormatConvertGenerator()
# In the future we could maintain an nn.Sequential-like container to make it easier to add and instantiate multiple operators
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
############# branch #############
self.answer_pipeline_root_step3.run(
storage = self.storage.step(),
input_answer_key = "output",
input_gt_key = "golden_answer"
)
############## answer #############
self.answer_generator_step4.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_ngram_filter_step5.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
self.sft_to_pretrain_step6.run(
storage = self.storage.step(),
input_read_key_question="instruction",
input_read_key_answer="generated_cot",
output_key="text",
)
if __name__ == "__main__":
pipeline = Reasoning_APIPipeline_Pretrain()
pipeline.forward()
from dataflow.operators.core_text import (
Text2QASampleEvaluator,
Text2QAGenerator,
KCenterGreedyFilter
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.serving import LocalModelLLMServing_vllm
class AgenticRAG_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/pipeline_small_chunk.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="json",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=1
)
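# NOTE: max_workers=1 issues chat-completion requests serially; increase it (as the other
# pipelines in this commit do) for higher throughput when your API rate limits allow.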
embedding_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=100
)
self.content_chooser_step1 = KCenterGreedyFilter(embedding_serving=embedding_serving, num_samples=5)
self.text2qa_generator_step3 = Text2QAGenerator(self.llm_serving)
self.text2qa_scorer_step4 = Text2QASampleEvaluator(self.llm_serving)
def forward(self):
self.content_chooser_step1.run(
storage = self.storage.step(),
input_key = "text"
)
self.text2qa_generator_step3.run(
storage = self.storage.step(),
input_key="text",
input_question_num= 3,
output_prompt_key="generated_prompt",
output_quesion_key="generated_question",
output_answer_key="generated_answer"
)
self.text2qa_scorer_step4.run(
storage = self.storage.step(),
input_question_key="generated_question",
input_answer_key="generated_answer",
output_question_quality_key="question_quality_grades",
output_question_quality_feedback_key="question_quality_feedbacks",
output_answer_alignment_key="answer_alignment_grades",
output_answer_alignment_feedback_key="answer_alignment_feedbacks",
output_answer_verifiability_key="answer_verifiability_grades",
)
if __name__ == "__main__":
model = AgenticRAG_APIPipeline()
model.forward()
import os
from dataflow import get_logger
import zipfile
from huggingface_hub import snapshot_download
from dataflow.operators.text2sql import (
SQLGenerator,
SQLByColumnGenerator,
Text2SQLQuestionGenerator,
Text2SQLPromptGenerator,
Text2SQLCoTGenerator
)
from dataflow.operators.text2sql import (
SQLExecutionFilter
)
from dataflow.operators.text2sql import (
SQLComponentClassifier,
SQLExecutionClassifier
)
from dataflow.prompts.text2sql import (
Text2SQLCotGeneratorPrompt,
SelectSQLGeneratorPrompt,
Text2SQLQuestionGeneratorPrompt,
Text2SQLPromptGeneratorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.utils.text2sql.database_manager import DatabaseManager
def download_and_extract_database(logger):
dataset_repo_id = "Open-Dataflow/dataflow-Text2SQL-database-example"
local_dir = "./hf_cache"
extract_to = "./downloaded_databases"
logger.info(f"Downloading and extracting database from {dataset_repo_id}...")
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
db_exp_folder_path = os.path.join(extract_to, "databases")
if os.path.exists(db_exp_folder_path):
return db_exp_folder_path
os.makedirs(local_dir, exist_ok=True)
os.makedirs(extract_to, exist_ok=True)
downloaded_path = snapshot_download(
repo_id=dataset_repo_id,
repo_type="dataset",
local_dir=local_dir,
resume_download=True
)
logger.info(f"Files downloaded to: {downloaded_path}")
zip_path = os.path.join(downloaded_path, "databases.zip")
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
logger.info(f"Database files extracted to {extract_to}")
return db_exp_folder_path
else:
raise FileNotFoundError(f"Database zip file not found at {zip_path}")
class Text2SQLGeneration_APIPipeline():
def __init__(self, db_root_path=""):
self.logger = get_logger()
self.db_root_path = db_root_path
if not db_root_path:
try:
self.db_root_path = download_and_extract_database(self.logger)
self.logger.info(f"Using automatically downloaded database at: {self.db_root_path}")
except Exception as e:
self.logger.error(f"Failed to auto-download database: {e}")
raise
else:
self.logger.info(f"Using manually specified database path: {self.db_root_path}")
if not os.path.exists(self.db_root_path):
raise FileNotFoundError(f"Database path does not exist: {self.db_root_path}")
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# It is recommended to use better LLMs for the generation of Chain-of-Thought (CoT) reasoning process.
cot_generation_api_llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o", # You can change to a more powerful model for CoT generation
max_workers=100
)
embedding_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=100
)
# SQLite and MySQL are currently supported
# db_type can be sqlite or mysql, which must match your database type
# If sqlite is selected, root_path must be provided; this path must exist and contain database files
# If mysql is selected, host, user, and password must be provided; these credentials must be correct and have access permissions
# MySQL example:
# database_manager = DatabaseManager(
# db_type="mysql",
# config={
# "host": "localhost",
# "user": "root",
# "password": "your_password",
# "database": "your_database_name"
# }
# )
# SQLite example:
database_manager = DatabaseManager(
db_type="sqlite",
config={
"root_path": self.db_root_path
}
)
self.sql_generator_step1 = SQLGenerator(
llm_serving=self.llm_serving,
database_manager=database_manager,
generate_num=2,
prompt_template=SelectSQLGeneratorPrompt()
)
self.sql_execution_filter_step2 = SQLExecutionFilter(
database_manager=database_manager
)
self.text2sql_question_generator_step3 = Text2SQLQuestionGenerator(
llm_serving=self.llm_serving,
embedding_serving=embedding_serving,
database_manager=database_manager,
question_candidates_num=5,
prompt_template=Text2SQLQuestionGeneratorPrompt()
)
self.text2sql_prompt_generator_step4 = Text2SQLPromptGenerator(
database_manager=database_manager,
prompt_template=Text2SQLPromptGeneratorPrompt()
)
self.sql_cot_generator_step5 = Text2SQLCoTGenerator(
llm_serving=cot_generation_api_llm_serving,
database_manager=database_manager,
prompt_template=Text2SQLCotGeneratorPrompt()
)
self.sql_component_classifier_step6 = SQLComponentClassifier(
difficulty_thresholds=[2, 4, 6],
difficulty_labels=['easy', 'medium', 'hard', 'extra']
)
self.sql_execution_classifier_step7 = SQLExecutionClassifier(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_generations=10,
difficulty_thresholds=[2, 5, 9],
difficulty_labels=['extra', 'hard', 'medium', 'easy']
)
def forward(self):
sql_key = "SQL"
db_id_key = "db_id"
question_key = "question"
evidence_key = "evidence"
self.sql_generator_step1.run(
storage=self.storage.step(),
output_sql_key=sql_key,
output_db_id_key=db_id_key
)
self.sql_execution_filter_step2.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.text2sql_question_generator_step3.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
output_question_key=question_key,
output_evidence_key=evidence_key
)
self.text2sql_prompt_generator_step4.run(
storage=self.storage.step(),
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_prompt_key="prompt"
)
self.sql_cot_generator_step5.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_cot_key="cot_reasoning"
)
self.sql_component_classifier_step6.run(
storage=self.storage.step(),
input_sql_key=sql_key,
output_difficulty_key="sql_component_difficulty"
)
self.sql_execution_classifier_step7.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_prompt_key="prompt",
output_difficulty_key="sql_execution_difficulty"
)
if __name__ == "__main__":
# If you have your own database files, you can set the db_root_path to the path of your database files
# If not, please set the db_root_path "", and we will download the example database files automatically
db_root_path = ""
model = Text2SQLGeneration_APIPipeline(db_root_path=db_root_path)
model.forward()
import os
from dataflow import get_logger
import zipfile
from pathlib import Path
from huggingface_hub import snapshot_download
from dataflow.operators.text2sql import (
SQLVariationGenerator,
Text2SQLQuestionGenerator,
Text2SQLPromptGenerator,
Text2SQLCoTGenerator
)
from dataflow.operators.text2sql import (
SQLExecutionFilter,
SQLConsistencyFilter
)
from dataflow.operators.text2sql import (
SQLComponentClassifier,
SQLExecutionClassifier
)
from dataflow.prompts.text2sql import (
SQLConsistencyFilterPrompt,
Text2SQLCotGeneratorPrompt,
Text2SQLQuestionGeneratorPrompt,
SQLVariationGeneratorPrompt,
Text2SQLPromptGeneratorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.utils.text2sql.database_manager import DatabaseManager
def download_and_extract_database(logger):
dataset_repo_id = "Open-Dataflow/dataflow-Text2SQL-database-example"
local_dir = "./hf_cache"
extract_to = "./downloaded_databases"
logger.info(f"Downloading and extracting database from {dataset_repo_id}...")
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
db_exp_folder_path = os.path.join(extract_to, "databases")
if os.path.exists(db_exp_folder_path):
return db_exp_folder_path
os.makedirs(local_dir, exist_ok=True)
os.makedirs(extract_to, exist_ok=True)
downloaded_path = snapshot_download(
repo_id=dataset_repo_id,
repo_type="dataset",
local_dir=local_dir,
resume_download=True
)
logger.info(f"Files downloaded to: {downloaded_path}")
zip_path = os.path.join(downloaded_path, "databases.zip")
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
logger.info(f"Database files extracted to {extract_to}")
return db_exp_folder_path
else:
raise FileNotFoundError(f"Database zip file not found at {zip_path}")
class Text2SQLRefine_APIPipeline():
def __init__(self, db_root_path=""):
self.logger = get_logger()
self.db_root_path = db_root_path
if not db_root_path:
try:
self.db_root_path = download_and_extract_database(self.logger)
self.logger.info(f"Using automatically downloaded database at: {self.db_root_path}")
except Exception as e:
self.logger.error(f"Failed to auto-download database: {e}")
raise
else:
self.logger.info(f"Using manually specified database path: {self.db_root_path}")
if not os.path.exists(self.db_root_path):
raise FileNotFoundError(f"Database path does not exist: {self.db_root_path}")
self.storage = FileStorage(
first_entry_file_name="../example_data/Text2SQLPipeline/pipeline_refine.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl"
)
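# NOTE: the refine pipeline expects existing SQL/question pairs plus their database ids;
# a hypothetical line of pipeline_refine.jsonl (values are illustrative) could be:
# {"db_id": "example_db", "question": "How many orders are there?", "SQL": "SELECT COUNT(*) FROM orders"}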
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# It is recommended to use better LLMs for the generation of Chain-of-Thought (CoT) reasoning process.
cot_generation_api_llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o", # You can change to a more powerful model for CoT generation
max_workers=100
)
embedding_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=100
)
# SQLite and MySQL are currently supported
# db_type can be sqlite or mysql, which must match your database type
# If sqlite is selected, root_path must be provided; this path must exist and contain database files
# If mysql is selected, host, user, and password must be provided; these credentials must be correct and have access permissions
# MySQL example:
# database_manager = DatabaseManager(
# db_type="mysql",
# config={
# "host": "localhost",
# "user": "root",
# "password": "your_password",
# "database": "your_database_name"
# }
# )
# SQLite example:
database_manager = DatabaseManager(
db_type="sqlite",
config={
"root_path": self.db_root_path
}
)
self.sql_execution_filter_step1 = SQLExecutionFilter(
database_manager=database_manager
)
self.sql_consistency_filter_step2 = SQLConsistencyFilter(
llm_serving=self.llm_serving,
database_manager=database_manager,
prompt_template=SQLConsistencyFilterPrompt()
)
self.sql_variation_generator_step3 = SQLVariationGenerator(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_variations=1, # Number of variations to generate for each SQL
prompt_template=SQLVariationGeneratorPrompt()
)
self.sql_execution_filter_step4 = SQLExecutionFilter(
database_manager=database_manager
)
self.text2sql_question_generator_step5 = Text2SQLQuestionGenerator(
llm_serving=self.llm_serving,
embedding_serving=embedding_serving,
database_manager=database_manager,
question_candidates_num=5,
prompt_template=Text2SQLQuestionGeneratorPrompt()
)
self.text2sql_prompt_generator_step6 = Text2SQLPromptGenerator(
database_manager=database_manager,
prompt_template=Text2SQLPromptGeneratorPrompt()
)
self.sql_cot_generator_step7 = Text2SQLCoTGenerator(
llm_serving=cot_generation_api_llm_serving,
database_manager=database_manager,
prompt_template=Text2SQLCotGeneratorPrompt()
)
self.sql_component_classifier_step8 = SQLComponentClassifier(
difficulty_thresholds=[2, 4, 6],
difficulty_labels=['easy', 'medium', 'hard', 'extra']
)
self.sql_execution_classifier_step9 = SQLExecutionClassifier(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_generations=10,
difficulty_thresholds=[2, 5, 9],
difficulty_labels=['extra', 'hard', 'medium', 'easy']
)
def forward(self):
sql_key = "SQL"
db_id_key = "db_id"
question_key = "question"
evidence_key = "evidence"
self.sql_execution_filter_step1.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.sql_consistency_filter_step2.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_question_key=question_key
)
self.sql_variation_generator_step3.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.sql_execution_filter_step4.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.text2sql_question_generator_step5.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
output_question_key=question_key,
output_evidence_key=evidence_key
)
self.text2sql_prompt_generator_step6.run(
storage=self.storage.step(),
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_prompt_key="prompt"
)
self.sql_cot_generator_step7.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_cot_key="cot_reasoning"
)
self.sql_component_classifier_step8.run(
storage=self.storage.step(),
input_sql_key=sql_key,
output_difficulty_key="sql_component_difficulty"
)
self.sql_execution_classifier_step9.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_prompt_key="prompt",
output_difficulty_key="sql_execution_difficulty"
)
if __name__ == "__main__":
# If you have your own database files, you can set the db_root_path to the path of your database files
# If not, please set the db_root_path "", and we will download the example database files automatically
db_root_path = ""
model = Text2SQLRefine_APIPipeline(db_root_path=db_root_path)
model.forward()
import os
from dataflow import get_logger
import zipfile
from pathlib import Path
from huggingface_hub import snapshot_download
from dataflow.operators.text2sql import (
SQLByColumnGenerator,
Text2SQLQuestionGenerator,
Text2SQLPromptGenerator
)
from dataflow.operators.text2sql import (
SQLExecutionFilter
)
from dataflow.operators.text2sql import (
SQLComponentClassifier,
SQLExecutionClassifier
)
from dataflow.prompts.text2sql import (
SelectVecSQLGeneratorPrompt,
Text2VecSQLQuestionGeneratorPrompt,
Text2VecSQLPromptGeneratorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.serving import LocalEmbeddingServing
from dataflow.utils.text2sql.database_manager import DatabaseManager
def download_and_extract_database(logger):
dataset_repo_id = "Open-Dataflow/dataflow-Text2SQL-vector-database-example"
subfolder = "databases_vec"
local_dir = "./hf_cache"
extract_to = "./downloaded_databases_vec"
logger.info(f"Downloading and extracting database from {dataset_repo_id}...")
# os.environ['HF_ENDPOINT'] = 'https://alpha.hf-mirror.com'
os.makedirs(local_dir, exist_ok=True)
os.makedirs(extract_to, exist_ok=True)
downloaded_path = snapshot_download(
repo_id=dataset_repo_id,
repo_type="dataset",
local_dir=local_dir,
resume_download=True
)
logger.info(f"Database files downloaded to {downloaded_path}")
zip_path = os.path.join(downloaded_path, "vector_databases.zip")
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
logger.info(f"Database files extracted to {extract_to}")
return extract_to
else:
raise FileNotFoundError(f"Database zip file not found at {zip_path}")
class Text2VecSQLGeneration_APIPipeline():
def __init__(self, db_root_path=""):
self.logger = get_logger()
self.db_root_path = db_root_path
if not db_root_path:
try:
self.db_root_path = download_and_extract_database(self.logger)
self.logger.info(f"Using automatically downloaded database at: {self.db_root_path}")
except Exception as e:
self.logger.error(f"Failed to auto-download database: {e}")
raise
else:
self.logger.info(f"Using manually specified database path: {self.db_root_path}")
if not os.path.exists(self.db_root_path):
raise FileNotFoundError(f"Database path does not exist: {self.db_root_path}")
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# It is recommended to use better LLMs for the generation of Chain-of-Thought (CoT) reasoning process.
# cot_generation_api_llm_serving = APILLMServing_request(
# api_url="http://api.openai.com/v1/chat/completions",
# model_name="gpt-4o", # You can change to a more powerful model for CoT generation
# max_workers=100
# )
embedding_serving = LocalEmbeddingServing(
model_name='sentence-transformers/all-MiniLM-L6-v2'
)
# SQLite and MySQL are currently supported
# db_type can be sqlite or mysql, which must match your database type
# If sqlite is selected, root_path must be provided; this path must exist and contain database files
# If mysql is selected, host, user, and password must be provided; these credentials must be correct and have access permissions
# MySQL example:
# database_manager = DatabaseManager(
# db_type="mysql",
# config={
# "host": "localhost",
# "user": "root",
# "password": "your_password",
# "database": "your_database_name"
# }
# )
# SQLite example:
database_manager = DatabaseManager(
db_type="sqlite-vec",
config={
"root_path": self.db_root_path,
"model_name": "all-MiniLM-L6-v2",
"model_path": "./hf_cache/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf",
"enable_lembed": False
},
embedding_serving=embedding_serving
)
self.sql_generator_step1 = SQLByColumnGenerator(
llm_serving=self.llm_serving,
database_manager=database_manager,
generate_num=2,
prompt_template=SelectVecSQLGeneratorPrompt()
)
self.sql_execution_filter_step2 = SQLExecutionFilter(
database_manager=database_manager,
)
self.text2sql_question_generator_step3 = Text2SQLQuestionGenerator(
llm_serving=self.llm_serving,
embedding_serving=embedding_serving,
database_manager=database_manager,
question_candidates_num=5,
prompt_template=Text2VecSQLQuestionGeneratorPrompt()
)
self.text2sql_prompt_generator_step4 = Text2SQLPromptGenerator(
database_manager=database_manager,
prompt_template=Text2VecSQLPromptGeneratorPrompt()
)
self.sql_component_classifier_step5 = SQLComponentClassifier(
difficulty_thresholds=[2, 4, 6],
difficulty_labels=['easy', 'medium', 'hard', 'extra']
)
self.sql_execution_classifier_step6 = SQLExecutionClassifier(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_generations=10,
difficulty_thresholds=[2, 5, 9],
difficulty_labels=['extra', 'hard', 'medium', 'easy']
)
def forward(self):
sql_key = "SQL"
db_id_key = "db_id"
question_key = "question"
self.sql_generator_step1.run(
storage=self.storage.step(),
output_sql_key=sql_key,
output_db_id_key=db_id_key
)
self.sql_execution_filter_step2.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.text2sql_question_generator_step3.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
output_question_key=question_key
)
self.text2sql_prompt_generator_step4.run(
storage=self.storage.step(),
input_question_key=question_key,
input_db_id_key=db_id_key,
output_prompt_key="prompt"
)
# self.sql_cot_generator_step5.run(
# storage=self.storage.step(),
# input_sql_key=sql_key,
# input_question_key=question_key,
# input_db_id_key=db_id_key,
# output_cot_key="cot_reasoning"
# )
self.sql_component_classifier_step5.run(
storage=self.storage.step(),
input_sql_key=sql_key,
output_difficulty_key="sql_component_difficulty"
)
self.sql_execution_classifier_step6.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_prompt_key="prompt",
output_difficulty_key="sql_execution_difficulty"
)
if __name__ == "__main__":
# If you have your own database files, you can set the db_root_path to the path of your database files
# If not, please set the db_root_path "", and we will download the example database files automatically
db_root_path = ""
model = Text2VecSQLGeneration_APIPipeline(db_root_path)
model.forward()
from dataflow.operators.conversations import ConsistentChatGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
class TextConversationSynthesis_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
self.model_cache_dir = './dataflow_cache'
self.processor = ConsistentChatGenerator(llm_serving=self.llm_serving, num_dialogs_per_intent=5)
def forward(self):
self.processor.run(
storage=self.storage.step()
)
if __name__ == "__main__":
# This is a test entry point for the TextPipeline
# It will run the forward method of the TextPipeline class
# to process the data and generate the output.
print("Running TextPipeline...")
model = TextConversationSynthesis_APIPipeline()
model.forward()
\ No newline at end of file
from dataflow.operators.text_sft import AlpagasusFilter
from dataflow.operators.text_sft import CondorGenerator
from dataflow.operators.text_sft import CondorRefiner
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
class TextSFTSynthesis_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.num_generated_samples = 3
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
self.generator = CondorGenerator(llm_serving=self.llm_serving, num_samples=self.num_generated_samples)
self.refiner = CondorRefiner(llm_serving=self.llm_serving)
self.alpagasus_filter = AlpagasusFilter(min_score=3, max_score=5, llm_serving=self.llm_serving)
def forward(self):
self.generator.run(
storage=self.storage.step()
)
self.refiner.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_output_key='output'
)
self.alpagasus_filter.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_input_key="input",
input_output_key='output'
)
if __name__ == "__main__":
model = TextSFTSynthesis_APIPipeline()
model.forward()
from dataflow.operators.core_text import PromptedEvaluator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_evaluator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/eval_data.json",
cache_path="./cache_1",
file_name_prefix="math_QA",
cache_type="json",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10
)
self.prompt_evaluator = PromptedEvaluator(
llm_serving = self.llm_serving,
)
def forward(self):
# Initial filters
self.prompt_evaluator.run(
storage = self.storage.step(),
input_key = "conversations",
output_key = "eval_dim_1",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_evaluator()
model.forward()