Commit 97e8278b authored by zzg_666

Adapt to vLLM backend

from typing import TYPE_CHECKING
if TYPE_CHECKING:
# generate
from .generate.speech2text_generator import Speech2TextGenerator
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/core_speech/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/core_speech/", _import_structure)
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
import os
import math
import warnings
import base64
from io import BytesIO
from typing import List, Optional, Union, Dict, Tuple
@OPERATOR_REGISTRY.register()
class Speech2TextGenerator(OperatorABC):
def __init__(
self,
llm_serving: LLMServingABC,
system_prompt: str = "You are a helpful assistant",
):
self.logger = get_logger()
self.llm_serving = llm_serving
self.system_prompt = system_prompt
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于将语音内容转录为文本。它接收语音文件路径或URL,使用大语言模型进行转录,"
"并将转录结果保存到数据框中。\n"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant'\n"
"- input_key:输入语音文件路径或URL的字段名,默认为'raw_content'\n"
"- output_key:输出转录文本的字段名,默认为'generated_content'\n"
"输出参数:\n"
"- 返回输出字段名,用于后续算子引用\n"
"- 在数据框中添加包含转录文本的新列"
)
elif lang == "en":
return (
"This operator transcribes speech content into text. It receives paths or URLs to speech files, "
"uses a large language model for transcription, and saves the transcription results to the dataframe.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- system_prompt: System prompt to define model behavior, default is 'You are a helpful assistant'\n"
"- input_key: Field name for input speech file paths or URLs, default is 'raw_content'\n"
"- output_key: Field name for output transcription text, default is 'generated_content'\n\n"
"Output Parameters:\n"
"- Returns output field name for subsequent operator reference\n"
"- Adds a new column containing transcription text to the dataframe"
)
else:
return (
"SpeechTranscriptor converts speech files to text using a large language model and saves results to a dataframe."
)
def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "generated_content"):
self.input_key, self.output_key = input_key, output_key
self.logger.info("Running Speech Transcriptor...")
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
llm_inputs = []
for index, row in dataframe.iterrows():
path_or_url = row.get(self.input_key, '')
llm_inputs.append(path_or_url)
transcriptions = self.llm_serving.generate_from_input(
user_inputs=llm_inputs,
system_prompt=self.system_prompt
)
dataframe[self.output_key] = transcriptions
output_file = storage.write(dataframe)
self.logger.info(f"Saving to {output_file}")
self.logger.info("Speech Transcriptor done")
return output_key
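# --- Illustrative usage sketch (not part of the original operator) ---
# Shows the run() contract of Speech2TextGenerator: storage.read('dataframe') supplies a
# DataFrame with audio paths/URLs, llm_serving.generate_from_input() returns one transcript
# per input, and run() writes the transcripts back and returns the output column name.
# _StubStorage and _StubServing are hypothetical stand-ins, not the real DataFlow classes.
if __name__ == "__main__":
    import pandas as pd

    class _StubStorage:
        def __init__(self, df): self.df = df
        def read(self, _fmt): return self.df
        def write(self, df): self.df = df; return "in-memory"

    class _StubServing:
        def generate_from_input(self, user_inputs, system_prompt=""):
            return [f"transcript of {u}" for u in user_inputs]

    df = pd.DataFrame({"raw_content": ["audio/a.wav", "audio/b.wav"]})
    op = Speech2TextGenerator(llm_serving=_StubServing())
    print(op.run(_StubStorage(df)))  # -> "generated_content"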
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from .generate.prompted_generator import PromptedGenerator
from .generate.prompt_templated_generator import PromptTemplatedGenerator
from .generate.random_domain_knowledge_row_generator import RandomDomainKnowledgeRowGenerator
from .generate.text2qa_generator import Text2QAGenerator
from .generate.text2multihopqa_generator import Text2MultiHopQAGenerator
from .generate.embedding_generator import EmbeddingGenerator
from .generate.retrieval_generator import RetrievalGenerator
from .eval.bench_dataset_evaluator import BenchDatasetEvaluator
from .eval.bench_dataset_evaluator_question import BenchDatasetEvaluatorQuestion
from .eval.text2qa_sample_evaluator import Text2QASampleEvaluator
from .eval.prompted_eval import PromptedEvaluator
from .filter.prompted_filter import PromptedFilter
from .filter.kcentergreedy_filter import KCenterGreedyFilter
from .filter.general_filter import GeneralFilter
from .refine.prompted_refiner import PromptedRefiner
from .refine.pandas_operator import PandasOperator
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/core_text/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/core_text/", _import_structure)
from dataflow.utils.reasoning.AnswerExtraction import StringCleaner, UnitTextManager, AnswerExtractor
from dataflow.prompts.model_evaluation.general import AnswerJudgePrompt
from dataflow.core.prompt import DIYPromptABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import LLMServingABC
from dataflow.core import OperatorABC
from math_verify import parse, verify
from dataflow import get_logger
from typing import Literal
import pandas as pd
import numpy as np
import time
import os
import re
@OPERATOR_REGISTRY.register()
class BenchDatasetEvaluator(OperatorABC):
def __init__(self,
eval_result_path: str = None,
compare_method: Literal["match", "semantic"] = "match",
system_prompt: str = "You are a helpful assistant specialized in evaluating answer correctness.",
llm_serving: LLMServingABC = None,
prompt_template: DIYPromptABC = None):
if eval_result_path is None:
timestamp = int(time.time())
eval_result_path = f"result_bencheval/BenchDatasetEvaluator_result_{timestamp}.json"
self.eval_result_path = eval_result_path
self.compare_method = compare_method
self.empty_responses_count = 0
if compare_method == "match":
self.compare = self.math_verify_compare
unit_manager = UnitTextManager()
string_cleaner = StringCleaner(unit_manager)
self.answer_extractor = AnswerExtractor(string_cleaner)
else:
if prompt_template is None:
prompt_template = AnswerJudgePrompt()
self.prompt_template = prompt_template
self.system_prompt = system_prompt
self.llm_serving = llm_serving
self.logger = get_logger()
def math_verify_compare(self, answer, ground_truth):
try:
return verify(parse(str(ground_truth)), parse(str(answer)))
except:
try:
return verify(parse(ground_truth), parse(answer))
except:
return False
def ResolveResponse(self, response):
if response is None or (isinstance(response, str) and response.strip() == ''):
self.empty_responses_count += 1
return False
try:
pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
match = pattern.search(response)
result_value = None
if match:
result_value = match.group(1).lower()
else:
if "true" in response.lower():
result_value = "true"
else:
result_value = "false"
return result_value == "true"
except Exception as e:
self.logger.error(f"Response format error: {response}. Error: {e}")
return False
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n"
"1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n"
"2. 语义匹配(semantic):使用LLM评估语义相似度,仅输入预测答案与标准答案\n\n"
"输入参数:\n"
"- input_test_answer_key:预测答案字段名\n"
"- input_gt_answer_key:标准答案字段名\n"
"- compare_method:比较方法(match/semantic)\n\n"
"输出参数:\n"
"- answer_match_result:匹配结果(True/False)\n"
"- 统计结果将保存到指定的eval_result_path路径\n"
)
else:
return (
"This operator compares predicted answers against ground truth using two evaluation modes:\n\n"
"1. String Matching (match): Mathematical verification for exact answers.\n"
"2. Semantic Matching (semantic): LLM-based evaluation comparing predicted vs ground truth answers only.\n\n"
"Input Parameters:\n"
"- input_test_answer_key: Predicted answer field\n"
"- input_gt_answer_key: Ground truth field\n"
"- compare_method: Comparison method (match/semantic)\n\n"
"Output Parameters:\n"
"- answer_match_result: Boolean match result\n"
"- Statistics are saved to the specified eval_result_path\n"
)
def check_column(self, required_columns: list[str], dataframe: pd.DataFrame):
for column in required_columns:
if column not in dataframe.columns:
self.logger.error(f"Required column '{column}' not found in dataframe")
return False
return True
def statistic(self, file_name_prefix: str, dataframe: pd.DataFrame, compare_method: str):
total_samples = len(dataframe)
valid_samples = len(dataframe) - self.empty_responses_count
matched_samples = sum(dataframe['answer_match_result'])
accuracy = matched_samples / valid_samples if valid_samples > 0 else 0
stats = {
"bench_name_or_prefix": file_name_prefix,
"total_samples": total_samples,
"valid_samples": valid_samples,
"matched_samples": matched_samples,
"accuracy": float(accuracy),
"empty_responses_count": self.empty_responses_count,
"compare_method": compare_method
}
stats_df = pd.DataFrame([stats])
os.makedirs(os.path.dirname(self.eval_result_path), exist_ok=True)
stats_df.to_json(self.eval_result_path, orient="records", force_ascii=False, indent=2)
self.logger.success(f"Statistics saved to {self.eval_result_path}")
return stats_df
def run(self,
storage: DataFlowStorage,
input_test_answer_key: str = "generated_cot",
input_gt_answer_key: str = "golden_answer") -> list:
dataframe = storage.read("dataframe")
dataframe['answer_match_result'] = False
answers = dataframe[input_test_answer_key]
ground_truths = dataframe[input_gt_answer_key]
if self.compare_method == "match":
if not self.check_column(
required_columns=[input_test_answer_key, input_gt_answer_key],
dataframe=dataframe
):
return [input_test_answer_key, input_gt_answer_key]
for i in range(len(answers)):
final_answer = self.answer_extractor.extract_answer(answers[i], None)
dataframe.at[i, 'answer_match_result'] = self.compare(final_answer, ground_truths[i])
storage.write(dataframe)
self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
return [input_test_answer_key, input_gt_answer_key, 'answer_match_result']
else:
if not self.check_column(
required_columns=[input_test_answer_key, input_gt_answer_key],
dataframe=dataframe
):
return [input_test_answer_key, input_gt_answer_key]
empty_reference_mask = dataframe[input_gt_answer_key].isna() | (dataframe[input_gt_answer_key] == '')
valid_rows = dataframe[~empty_reference_mask]
skipped_count = len(dataframe[empty_reference_mask])
if len(valid_rows) == 0:
self.logger.warning("No valid reference answers found. All samples skipped.")
storage.write(dataframe)
return [input_test_answer_key, input_gt_answer_key, 'answer_match_result']
# build the prompt from only the predicted answer and the ground truth
inputs = [
self.prompt_template.build_prompt(
answer=row[input_test_answer_key],
reference_answer=row[input_gt_answer_key]
)
for _, row in valid_rows.iterrows()
]
responses = self.llm_serving.generate_from_input(
user_inputs=inputs,
system_prompt=self.system_prompt
)
results = [self.ResolveResponse(response) for response in responses]
for i, idx in enumerate(valid_rows.index):
dataframe.at[idx, 'answer_match_result'] = results[i]
storage.write(dataframe)
self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
self.empty_responses_count = 0
return [input_test_answer_key, input_gt_answer_key, 'answer_match_result']
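# --- Illustrative sketch: how ResolveResponse interprets a semantic-judgement reply ---
# Uses the same regex as above on two hypothetical response strings; a reply without an
# explicit "judgement_result" falls back to a substring check for "true".
if __name__ == "__main__":
    import re

    judgement_re = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
    for sample in ['{"judgement_result": true, "reason": "answers agree"}',
                   'The prediction contradicts the reference, "judgement_result": False']:
        m = judgement_re.search(sample)
        verdict = (m.group(1).lower() == "true") if m else ("true" in sample.lower())
        print(verdict)  # True, then False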
from dataflow.utils.reasoning.AnswerExtraction import StringCleaner, UnitTextManager, AnswerExtractor
from dataflow.prompts.model_evaluation.general import AnswerJudgePromptQuestion, AnswerJudgeMultipleQuestionsPrompt
from dataflow.core.prompt import DIYPromptABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import LLMServingABC
from dataflow.core import OperatorABC
from math_verify import parse, verify
from dataflow import get_logger
from typing import Literal
import pandas as pd
import numpy as np
import time
import os
import re
import json
import json5
@OPERATOR_REGISTRY.register()
class BenchDatasetEvaluatorQuestion(OperatorABC):
def __init__(self,
eval_result_path: str = None,
compare_method: Literal["match", "semantic"] = "match",
system_prompt: str = "You are a helpful assistant specialized in evaluating answer correctness.",
llm_serving: LLMServingABC = None,
prompt_template: DIYPromptABC = None,
support_subquestions: bool = False
):
if eval_result_path is None:
timestamp = int(time.time())
eval_result_path = f"result_bencheval/BenchDatasetEvaluator_result_{timestamp}.json"
self.eval_result_path = eval_result_path
self.compare_method = compare_method
self.empty_responses_count = 0 # counter for empty responses
if compare_method == "match":
self.compare = self.math_verify_compare
unit_manager = UnitTextManager()
string_cleaner = StringCleaner(unit_manager)
self.answer_extractor = AnswerExtractor(string_cleaner)
else:
if prompt_template is None:
prompt_template = AnswerJudgePromptQuestion() if not support_subquestions else AnswerJudgeMultipleQuestionsPrompt()
self.prompt_template = prompt_template
self.system_prompt = system_prompt
self.llm_serving = llm_serving
self.support_subquestions = support_subquestions
self.logger = get_logger()
def math_verify_compare(self, answer, ground_truth):
try:
return verify(parse(str(ground_truth)), parse(str(answer)))
except:
try:
return verify(parse(ground_truth), parse(answer))
except:
return False
def ResolveResponse(self, response):
# check for empty responses
if not self.support_subquestions:
if response is None or (isinstance(response, str) and response.strip() == ''):
self.empty_responses_count += 1
return False
try:
pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
match = pattern.search(response)
result_value = None
if match:
result_value = match.group(1).lower()
else:
# fallback parsing: check whether the response text contains true or false
if "true" in response.lower():
result_value = "true"
else:
result_value = "false"
if result_value == "true":
return True
else:
return False
except Exception as e:
self.logger.error(f"Response format error: {response}. Error: {e}")
return False
if self.support_subquestions:
# with subquestions enabled, the response is expected to hold a list of judgements; return "correct/total"
correct_num = 0
total_num = 0
try:
response = json5.loads(response, strict=False) # parse with json5 to tolerate looser JSON formatting
judgement = response.get("judgement", [])
except Exception as e:
self.logger.error(f"Response JSON parse error: {response}. Error: {e}")
self.empty_responses_count += 1
return "0/0"
for resp in judgement:
if isinstance(resp, bool):
if resp is True:
correct_num += 1
total_num += 1
elif resp is False:
total_num += 1
elif resp.lower() == "empty":
continue # 不计入总数
elif isinstance(resp, str):
if resp.lower() == "true":
correct_num += 1
total_num += 1
elif resp.lower() == "false":
total_num += 1
elif resp.lower() == "empty":
continue # "empty" answers are not counted toward the total
return f"{correct_num}/{total_num}"
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n"
"1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n"
"2. 语义匹配(semantic):使用LLM评估答案的语义相似度,适用于开放性问题\n\n"
"输入参数:\n"
"- input_test_answer_key:预测答案字段名\n"
"- input_gt_answer_key:标准答案字段名\n"
"- input_question_key:问题字段名(语义匹配模式下必需)\n"
"- compare_method:比较方法(match/semantic)\n\n"
"输出参数:\n"
"- answer_match_result:匹配结果(True/False)\n"
"- 统计结果将保存到指定的eval_result_path路径\n"
)
elif lang == "en":
return (
"This operator compares predicted answers against ground truth using two evaluation modes:\n\n"
"1. String Matching (match): Uses mathematical verification to compare answers, suitable for questions with definitive answers\n"
"2. Semantic Matching (semantic): Uses LLM to evaluate semantic similarity, suitable for open-ended questions\n\n"
"Input Parameters:\n"
"- input_test_answer_key: Predicted answer field\n"
"- input_gt_answer_key: Ground truth field\n"
"- input_question_key: Question field (required for semantic mode)\n"
"- compare_method: Comparison method (match/semantic)\n\n"
"Output Parameters:\n"
"- answer_match_result: Matching result (True/False)\n"
"- Statistics will be saved to the specified eval_result_path\n"
)
else:
return "BenchEvaluator performs answer validation using string matching or semantic comparison"
def check_column(self, required_columns: list[str], dataframe: pd.DataFrame):
for column in required_columns:
if column not in dataframe.columns:
self.logger.error(f"Required column '{column}' not found in dataframe")
return False
return True
def statistic(self, file_name_prefix: str, dataframe: pd.DataFrame, compare_method: Literal["match", "semantic"]):
total_samples = len(dataframe)
valid_samples = len(dataframe) - self.empty_responses_count
matched_samples = sum(dataframe['answer_match_result'])
accuracy = matched_samples / valid_samples if valid_samples > 0 else 0
# build the statistics dictionary
stats = {
"bench_name_or_prefix": file_name_prefix,
"total_samples": total_samples,
"valid_samples": valid_samples,
"matched_samples": matched_samples,
"accuracy": float(accuracy), # 确保可以被JSON序列化
"empty_responses_count": self.empty_responses_count,
"compare_method": compare_method
}
if self.support_subquestions:
total_subquestions = dataframe['total_subquestions'].sum()
correct_subquestions = dataframe['correct_answer_num'].sum()
subquestion_accuracy = correct_subquestions / total_subquestions if total_subquestions > 0 else 0
stats.update({
"total_subquestions": int(total_subquestions),
"correct_subquestions": int(correct_subquestions),
"subquestion_accuracy": float(subquestion_accuracy)
})
# convert the stats dict to a DataFrame
stats_df = pd.DataFrame([stats])
# write the statistics directly to self.eval_result_path
os.makedirs(os.path.dirname(self.eval_result_path), exist_ok=True)
stats_df.to_json(self.eval_result_path, orient="records", force_ascii=False, indent=2)
self.logger.success(f"Statistics saved to {self.eval_result_path}")
return stats_df
def run(
self,
storage:DataFlowStorage,
input_test_answer_key: str = "generated_cot",
input_gt_answer_key: str = "golden_answer",
input_question_key: str = None,
) -> list:
self.test_answer_key = input_test_answer_key
self.gt_answer_key = input_gt_answer_key
self.question_key = input_question_key
dataframe = storage.read("dataframe")
dataframe['answer_match_result'] = False
answers = dataframe[self.test_answer_key]
ground_truths = dataframe[self.gt_answer_key]
if self.compare_method == "match":
if self.check_column(
required_columns=[input_test_answer_key,input_gt_answer_key],
dataframe=dataframe
) is False:
return [input_test_answer_key, input_gt_answer_key]
for i in range(len(answers)):
final_answer = self.answer_extractor.extract_answer(answers[i], None)
if self.compare(final_answer, ground_truths[i]):
dataframe.at[i, 'answer_match_result'] = True
else:
dataframe.at[i, 'answer_match_result'] = False
output_file = storage.write(dataframe)
# compute statistics and write them directly to the JSON file
stats = self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
return [self.test_answer_key, self.gt_answer_key, 'answer_match_result']
else:
if self.check_column(
required_columns=[input_test_answer_key,input_gt_answer_key, input_question_key],
dataframe=dataframe
) is False:
return [input_test_answer_key, input_gt_answer_key, input_question_key]
empty_reference_mask = dataframe[input_gt_answer_key].isna() | (dataframe[input_gt_answer_key] == '')
skipped_rows = dataframe[empty_reference_mask]
valid_rows = dataframe[~empty_reference_mask]
skipped_count = len(skipped_rows)
if len(valid_rows) == 0:
self.logger.warning("No valid samples with reference answers found. All samples skipped.")
output_file = storage.write(dataframe) # keep all rows; answer_match_result stays False
self.logger.info(f"Dataframe saved to {output_file}. Skipped {skipped_count} samples due to missing reference answers.")
return [input_test_answer_key, input_gt_answer_key, input_question_key, 'answer_match_result']
# build prompts and call the LLM only for rows that have a reference answer
inputs = [self.prompt_template.build_prompt(
question=row[input_question_key],
answer=row[input_test_answer_key],
reference_answer=row[input_gt_answer_key]
) for _, row in valid_rows.iterrows()]
responses = self.llm_serving.generate_from_input(user_inputs=inputs, system_prompt=self.system_prompt)
# if self.support_subquestions:
# # each response is a list; flatten into one long list, e.g. [["true", "false"], ["true"]] -> ["true", "false", "true"]
# responses = [item for sublist in responses for item in sublist]
results = [self.ResolveResponse(response) for response in responses]
# update answer_match_result for the valid rows
valid_indices = valid_rows.index
if not self.support_subquestions:
for i, idx in enumerate(valid_indices):
dataframe.at[idx, 'answer_match_result'] = results[i]
else:
for i, idx in enumerate(valid_indices):
correct_answer_num = int(results[i].split('/')[0])
total_subquestions = int(results[i].split('/')[1])
dataframe.at[idx, 'correct_answer_num'] = correct_answer_num
dataframe.at[idx, 'total_subquestions'] = total_subquestions
dataframe.at[idx, 'answer_match_result'] = (correct_answer_num == total_subquestions) and (total_subquestions > 0) # True only if every subquestion is answered correctly
dataframe.at[idx, 'response_evaluation'] = responses[i] # keep the raw LLM response
output_file = storage.write(dataframe)
# compute statistics and write them directly to the JSON file
stats = self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
# reset the empty-response counter
self.empty_responses_count = 0
return [input_test_answer_key, input_gt_answer_key, input_question_key, 'answer_match_result']
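# --- Illustrative sketch: the "correct/total" bookkeeping used in subquestion mode ---
# Applies the same counting rules as ResolveResponse to an already-parsed judgement list
# (booleans, "true"/"false" strings, or "empty" entries that are excluded from the total).
if __name__ == "__main__":
    def count_judgements(judgement):
        correct, total = 0, 0
        for item in judgement:
            if isinstance(item, bool):
                correct += int(item)
                total += 1
            elif isinstance(item, str) and item.lower() in ("true", "false"):
                correct += int(item.lower() == "true")
                total += 1
            # "empty" (or anything unrecognized) is not counted
        return f"{correct}/{total}"

    print(count_judgements([True, "false", "empty", "TRUE"]))  # -> "2/3"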
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
import re
@OPERATOR_REGISTRY.register()
class PromptedEvaluator(OperatorABC):
'''
PromptedEvaluator scores each input row with an LLM and writes the integer score back to the DataFrame.
'''
def __init__(self, llm_serving: LLMServingABC, system_prompt: str = "Please evaluate the quality of this data on a scale from 1 to 5."):
self.logger = get_logger()
self.llm_serving = llm_serving
self.system_prompt = system_prompt
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"PromptedEvaluator:使用 LLM 根据系统提示词对数据质量进行评分,并将评分写回 DataFrame(同时通过 "
"storage 持久化)。模型应只输出分数(整数)。\n"
"功能:对每行输入文本生成一个评分。\n"
"输入参数:\n"
"- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口。\n"
"- system_prompt:系统提示词(默认:'Please evaluate the quality of this data on a scale from 1 to 5.')。\n"
"- input_key:输入文本所在列名(默认:'raw_content')。\n"
"- output_key:评分结果写入的列名(默认:'eval')。\n"
"输出:\n"
"- 返回输出列名(用于后续算子引用),评分结果已写回并保存。"
)
elif lang == "en":
return (
"PromptedEvaluator: uses an LLM to rate data quality and writes the score back to the "
"DataFrame (persisted via storage). The model is expected to output only the integer score.\n"
"Purpose: for each input row, produce an score.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC.\n"
"- system_prompt: system prompt (default: 'Please evaluate the quality of this data on a scale from 1 to 5.').\n"
"- input_key: column name containing input text (default: 'raw_content').\n"
"- output_key: column name to store scores (default: 'eval').\n"
"Output:\n"
"- Returns the output column name for downstream operators; the scored DataFrame is saved."
)
else:
return "PromptedEvaluator rates data quality (1–5) from input text and stores the integer score."
def _parse_scores(self, outputs: list[str]) -> list[int]:
"""
将模型输出的分数字符串转为整数。
- 成功提取到 1–5 范围内的分数 → 返回该分数
- 提取失败或不合法 → 返回 0
"""
results = []
for out in outputs:
score = 0
try:
if out is None:
results.append(0)
continue
text = str(out).strip()
# use a regex to find the first integer
match = re.search(r"\d+", text)
if match:
val = int(match.group())
if 1 <= val <= 5:
score = val
# otherwise keep the default of 0
except Exception:
score = 0
results.append(score)
return results
def eval(self, dataframe, input_key):
llm_inputs = []
for index, row in dataframe.iterrows():
raw_content = row.get(input_key, '')
# build one prompt per row so the returned scores align with the dataframe length
llm_input = self.system_prompt + str(raw_content) + 'Please only output the score!'
llm_inputs.append(llm_input)
try:
self.logger.info("Generating text using the model...")
generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
scores = self._parse_scores(generated_outputs)
self.logger.info("Text generation completed.")
except Exception as e:
self.logger.error(f"Error during text generation: {e}")
return
return scores
def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "eval"):
self.logger.info("Running PromptGenerator...")
# Load the raw dataframe from the input file
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
# Score each row with the LLM
generated_outputs = self.eval(dataframe, input_key)
# Add the generated content back to the dataframe
dataframe[output_key] = generated_outputs
# Save the updated dataframe to the output file
output_file = storage.write(dataframe)
return output_key
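# --- Illustrative sketch of the scoring rule used by _parse_scores ---
# Takes the first integer in a model reply and keeps it only if it lies in 1-5, else 0;
# the sample outputs below are hypothetical.
if __name__ == "__main__":
    import re

    def parse_score(text):
        match = re.search(r"\d+", str(text).strip()) if text is not None else None
        if match and 1 <= int(match.group()) <= 5:
            return int(match.group())
        return 0

    print([parse_score(x) for x in ["Score: 4", "7", None, "no digits"]])  # [4, 0, 0, 0]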
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
import re
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict
from dataflow.prompts.text2qa import (
Text2QAQuestionQualityPrompt,
Text2QAAnswerAlignmentPrompt,
Text2QAAnswerVerifiabilityPrompt,
Text2QADownstreamValuePrompt
)
@prompt_restrict(
Text2QAQuestionQualityPrompt,
Text2QAAnswerAlignmentPrompt,
Text2QAAnswerVerifiabilityPrompt,
Text2QADownstreamValuePrompt
)
@OPERATOR_REGISTRY.register()
class Text2QASampleEvaluator(OperatorABC):
'''
Text2QASampleEvaluator scores generated QA pairs on question quality, answer alignment, answer verifiability, and downstream value.
'''
def __init__(self,
llm_serving: LLMServingABC,
# prompt_template = None # prompt is fix
):
self.logger = get_logger()
self.llm_serving = llm_serving
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于为给的的文档片段生成种子QA对打分\n\n"
"输入参数:\n"
"- input_question_key: Field name containing the generated question\n"
"- input_answer_key: Field name containing the generated answer\n"
"- output_question_quality_key: Field name containing the question quality grade\n"
"- output_question_quality_feedback_key: Field name containing the question quality feedback\n"
"- output_answer_alignment_key: Field name containing the answer alignment grade\n"
"- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n"
"- output_answer_verifiability_key: Field name containing the answer verifiability grade\n"
"- output_downstream_value_key: Field name containing the downstream value grade\n"
"- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n"
)
elif lang == "en":
return (
"This operator generates prompts for given document fragments to generate seed QA pairs.\n\n"
"Input Parameters:\n"
"- input_question_key: Field name containing the generated question\n"
"- input_answer_key: Field name containing the generated answer\n"
"- output_question_quality_key: Field name containing the question quality grade\n"
"- output_question_quality_feedback_key: Field name containing the question quality feedback\n"
"- output_answer_alignment_key: Field name containing the answer alignment grade\n"
"- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n"
"- output_answer_verifiability_key: Field name containing the answer verifiability grade\n"
"- output_downstream_value_key: Field name containing the downstream value grade\n"
"- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n"
)
else:
return "QAScorer scores QA pairs for given document fragments."
def _validate_dataframe(self, dataframe: pd.DataFrame):
required_keys = [self.input_question_key, self.input_answer_key]
forbidden_keys = [self.output_question_quality_key, self.output_question_quality_feedback_key, self.output_answer_alignment_key, self.output_answer_alignment_feedback_key, self.output_answer_verifiability_key, self.output_answer_verifiability_feedback_key, self.output_downstream_value_key, self.output_downstream_value_feedback_key]
missing = [k for k in required_keys if k not in dataframe.columns]
conflict = [k for k in forbidden_keys if k in dataframe.columns]
if missing:
raise ValueError(f"Missing required column(s): {missing}")
if conflict:
raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")
def _build_prompts(self, dataframe):
"""
Reformat the prompts in the dataframe to generate questions.
"""
question_quality_inputs = []
self.prompts = Text2QAQuestionQualityPrompt()
question_quality_prompt = self.prompts.build_prompt()
answer_alignment_inputs = []
self.prompts = Text2QAAnswerAlignmentPrompt()
answer_alignment_prompt = self.prompts.build_prompt()
answer_verifiability_inputs = []
self.prompts = Text2QAAnswerVerifiabilityPrompt()
answer_verifiability_prompt = self.prompts.build_prompt()
downstream_value_inputs = []
self.prompts = Text2QADownstreamValuePrompt()
downstream_value_prompt = self.prompts.build_prompt()
for index, row in dataframe.iterrows():
question_quality_content = question_quality_prompt + "Question: " + row[self.input_question_key] + "\n" + "Answer: " + row[self.input_answer_key]
question_quality_inputs.append(question_quality_content)
answer_alignment_content = answer_alignment_prompt + "Question: " + row[self.input_question_key] + "\n" + "Answer: " + row[self.input_answer_key]
answer_alignment_inputs.append(answer_alignment_content)
answer_verifiability_content = answer_verifiability_prompt + "Question: " + row[self.input_question_key] + "\n" + "Answer: " + row[self.input_answer_key]
answer_verifiability_inputs.append(answer_verifiability_content)
downstream_value_content = downstream_value_prompt + "Question: " + row[self.input_question_key] + "\n" + "Answer: " + row[self.input_answer_key]
downstream_value_inputs.append(downstream_value_content)
return question_quality_inputs, answer_alignment_inputs, answer_verifiability_inputs, downstream_value_inputs
def _parse_grade_and_feedback(self, response: str) -> tuple:
grading_match = re.search(r"\*\*Grading\*\*:\s*(\d+)", response)
feedback_match = re.search(r"\*\*Feedback\*\*:\s*(.+)", response, re.DOTALL)
grading = float(grading_match.group(1)) if grading_match else 0
feedback = feedback_match.group(1).strip() if feedback_match else ''
return grading, feedback
def run(
self,
storage: DataFlowStorage,
input_question_key: str = "generated_question",
input_answer_key: str = "generated_answer",
output_question_quality_key: str = "question_quality_grades",
output_question_quality_feedback_key: str = "question_quality_feedbacks",
output_answer_alignment_key: str = "answer_alignment_grades",
output_answer_alignment_feedback_key: str = "answer_alignment_feedbacks",
output_answer_verifiability_key: str = "answer_verifiability_grades",
output_answer_verifiability_feedback_key: str = "answer_verifiability_feedbacks",
output_downstream_value_key: str = "downstream_value_grades",
output_downstream_value_feedback_key: str = "downstream_value_feedbacks"
):
self.input_question_key, self.input_answer_key, self.output_question_quality_key, self.output_question_quality_feedback_key, self.output_answer_alignment_key, self.output_answer_alignment_feedback_key, self.output_answer_verifiability_key, self.output_answer_verifiability_feedback_key, self.output_downstream_value_key, self.output_downstream_value_feedback_key = input_question_key, input_answer_key, output_question_quality_key, output_question_quality_feedback_key, output_answer_alignment_key, output_answer_alignment_feedback_key, output_answer_verifiability_key, output_answer_verifiability_feedback_key, output_downstream_value_key, output_downstream_value_feedback_key
dataframe = storage.read("dataframe")
self._validate_dataframe(dataframe)
# build the evaluation prompts
q_inputs, a_inputs, v_inputs, d_inputs = self._build_prompts(dataframe)
# generate the four kinds of grades and feedback
self.logger.info("Scoring question quality...")
q_scores = self.llm_serving.generate_from_input(user_inputs=q_inputs, system_prompt="")
q_grades, q_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in q_scores])
self.logger.info("Scoring answer alignment...")
a_scores = self.llm_serving.generate_from_input(user_inputs=a_inputs, system_prompt="")
a_grades, a_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in a_scores])
self.logger.info("Scoring answer verifiability...")
v_scores = self.llm_serving.generate_from_input(user_inputs=v_inputs, system_prompt="")
v_grades, v_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in v_scores])
self.logger.info("Scoring downstream value...")
d_scores = self.llm_serving.generate_from_input(user_inputs=d_inputs, system_prompt="")
d_grades, d_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in d_scores])
# write results back to the dataframe
dataframe[self.output_question_quality_key] = q_grades
dataframe[self.output_question_quality_feedback_key] = q_feedbacks
dataframe[self.output_answer_alignment_key] = a_grades
dataframe[self.output_answer_alignment_feedback_key] = a_feedbacks
dataframe[self.output_answer_verifiability_key] = v_grades
dataframe[self.output_answer_verifiability_feedback_key] = v_feedbacks
dataframe[self.output_downstream_value_key] = d_grades
dataframe[self.output_downstream_value_feedback_key] = d_feedbacks
output_file = storage.write(dataframe)
self.logger.info(f"Results saved to {output_file}")
return [
output_question_quality_key, output_question_quality_feedback_key,
output_answer_alignment_key, output_answer_alignment_feedback_key,
output_answer_verifiability_key, output_answer_verifiability_feedback_key,
output_downstream_value_key, output_downstream_value_feedback_key
]
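# --- Illustrative sketch: parsing a grading reply ---
# Mirrors _parse_grade_and_feedback above on a hypothetical "**Grading**/**Feedback**" reply.
if __name__ == "__main__":
    import re

    reply = "**Grading**: 4\n**Feedback**: The question is specific and the answer is verifiable."
    grading_match = re.search(r"\*\*Grading\*\*:\s*(\d+)", reply)
    feedback_match = re.search(r"\*\*Feedback\*\*:\s*(.+)", reply, re.DOTALL)
    print(float(grading_match.group(1)) if grading_match else 0)
    print(feedback_match.group(1).strip() if feedback_match else "")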
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
import pandas as pd
@OPERATOR_REGISTRY.register()
class GeneralFilter(OperatorABC):
def __init__(self, filter_rules: list):
self.logger = get_logger()
self.filter_rules = filter_rules
self.logger.info(f"Initializing {self.__class__.__name__} with rules: {self.filter_rules}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子支持通过多个自定义函数对 DataFrame 进行灵活过滤。\n\n"
"每条过滤规则是一个函数(例如 lambda 表达式),接受一个 DataFrame 并返回一个布尔类型的 Series,"
"用于指定保留哪些行。\n\n"
"输入参数:\n"
"- filter_rules:一个函数列表,每个函数形式为 lambda df: ...,"
"需返回一个与 df 长度一致的布尔 Series。所有规则之间采用与(AND)关系组合。\n\n"
"示例:\n"
" - lambda df: df['score'] > 0.5\n"
" - lambda df: df['label'].isin(['A', 'B'])"
)
elif lang == "en":
return (
"This operator applies custom filtering functions to a DataFrame.\n\n"
"Each filter rule is a function (e.g., lambda expression) that takes a DataFrame "
"and returns a boolean Series indicating which rows to retain.\n\n"
"Input Parameters:\n"
"- filter_rules: A list of functions, each in the form of lambda df: ..., "
"returning a boolean Series of the same length as the DataFrame. "
"All rules are combined using logical AND.\n\n"
"Examples:\n"
" - lambda df: df['score'] > 0.5\n"
" - lambda df: df['label'].isin(['A', 'B'])"
)
else:
return "GeneralFilter filters DataFrame rows using a list of functions returning boolean Series."
def _validate_dataframe(self, dataframe: pd.DataFrame):
required_keys = [self.input_key]
forbidden_keys = []
missing = [k for k in required_keys if k not in dataframe.columns]
conflict = [k for k in forbidden_keys if k in dataframe.columns]
if missing:
raise ValueError(f"Missing required column(s): {missing}")
if conflict:
raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")
def run(self,
storage: DataFlowStorage,
):
df = storage.read("dataframe")
mask = pd.Series(True, index=df.index)
for rule_fn in self.filter_rules:
if not callable(rule_fn):
raise ValueError("Each filter rule must be a callable(e.g., lambda df: ...)")
cond = rule_fn(df)
if not isinstance(cond, pd.Series) or cond.dtype != bool:
raise ValueError("Each filter function must return a boolean Series")
mask &= cond
filtered_df = df[mask]
self.logger.info(f"Filtering complete. Remaining rows: {len(filtered_df)}")
storage.write(filtered_df)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_df)}.")
return ""
import numpy as np
import pandas as pd
import random
import torch
from torch import Tensor
from typing import List, Optional
import torch.nn.functional as F
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC, LLMServingABC
class KCenterGreedy:
"""Implements k-center-greedy method.
Args:
embedding (Tensor): Embedding vector extracted from a LLM
sampling_ratio (float): Ratio to choose coreset size from the embedding size.
Example:
>>> embedding.shape
torch.Size([219520, 1536])
>>> sampler = KCenterGreedy(embedding=embedding)
>>> sampled_idxs = sampler.select_coreset_idxs()
>>> coreset = embedding[sampled_idxs]
>>> coreset.shape
torch.Size([219, 1536])
"""
def __init__(self, embedding: Tensor, sampling_ratio: float) -> None:
self.embedding = embedding
self.coreset_size = int(embedding.shape[0] * sampling_ratio)
# self.model = SparseRandomProjection(eps=0.9)
self.features: Tensor
self.min_distances: Tensor = torch.tensor([])
self.n_observations = self.embedding.shape[0]
def reset_distances(self) -> None:
"""Reset minimum distances."""
self.min_distances = torch.tensor([])
def update_distances(self, cluster_centers: List[int]) -> None:
"""Update min distances given cluster centers.
Args:
cluster_centers (List[int]): indices of cluster centers
"""
if cluster_centers:
centers = self.features[cluster_centers]
distance = F.pairwise_distance(self.features, centers, p=2).reshape(-1, 1)
if self.min_distances.shape[0] == 0:
self.min_distances = distance
else:
self.min_distances = torch.minimum(self.min_distances, distance)
def get_new_idx(self) -> int:
"""Get index value of a sample.
Based on minimum distance of the cluster
Returns:
int: Sample index
"""
if isinstance(self.min_distances, Tensor):
idx = int(torch.argmax(self.min_distances).item())
else:
raise ValueError(f"self.min_distances must be of type Tensor. Got {type(self.min_distances)}")
return idx
def select_coreset_idxs(self, selected_idxs: Optional[List[int]] = None) -> List[int]:
"""Greedily form a coreset to minimize the maximum distance of a cluster.
Args:
selected_idxs: index of samples already selected. Defaults to an empty set.
Returns:
indices of samples selected to minimize distance to cluster centers
"""
if selected_idxs is None:
selected_idxs = []
if self.embedding.ndim == 2:
# self.model.fit(self.embedding)
# self.features = self.model.transform(self.embedding)
self.features = self.embedding
self.reset_distances()
else:
self.features = self.embedding.reshape(self.embedding.shape[0], -1)
self.update_distances(cluster_centers=selected_idxs)
selected_coreset_idxs: List[int] = []
idx = int(torch.randint(high=self.n_observations, size=(1,)).item())
cnt = 0
for _ in range(self.coreset_size):
cnt += 1
if cnt % 1000 == 0:
print(cnt)
self.update_distances(cluster_centers=[idx])
idx = self.get_new_idx()
if idx in selected_idxs:
raise ValueError("New indices should not be in selected indices.")
self.min_distances[idx] = 0
selected_coreset_idxs.append(idx)
return selected_coreset_idxs
def sample_coreset(self, selected_idxs: Optional[List[int]] = None) -> Tensor:
"""Select coreset from the embedding.
Args:
selected_idxs: index of samples already selected. Defaults to an empty set.
Returns:
Tensor: Output coreset
Example:
>>> embedding.shape
torch.Size([219520, 1536])
>>> sampler = KCenterGreedy(...)
>>> coreset = sampler.sample_coreset()
>>> coreset.shape
torch.Size([219, 1536])
"""
idxs = self.select_coreset_idxs(selected_idxs)
coreset = self.embedding[idxs]
return coreset
@OPERATOR_REGISTRY.register()
class KCenterGreedyFilter(OperatorABC):
def __init__(self, num_samples: int, embedding_serving : LLMServingABC = None):
self.num_samples = num_samples
self.embedding_serving = embedding_serving
self.logger = get_logger()
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于从大量的文档片段中选取部分文档片段,用于后续生成种子QA对\n\n"
"输入参数:\n"
"- input_key: 包含文档片段的字段名\n"
"- embedding_model_path: 嵌入模型路径\n"
"- num_samples: 选取的文档片段数量\n"
"- method: 选择方法,随机或k-center-greedy\n\n"
)
elif lang == "en":
return (
"This operator chooses document fragments for seed QA pairs.\n\n"
"Input Parameters:\n"
"- input_key: Field name containing the content\n"
"- embedding_serving: Embedding serving\n"
"- num_samples: Number of document fragments to select\n"
"- method: Selection method, random or k-center-greedy\n\n"
"Output Parameters:\n"
"- Returns 1 for valid content, 0 otherwise"
)
else:
return "ContentChooser chooses document fragments for seed QA pairs"
def _validate_dataframe(self, dataframe: pd.DataFrame):
required_keys = [self.input_key]
forbidden_keys = []
missing = [k for k in required_keys if k not in dataframe.columns]
conflict = [k for k in forbidden_keys if k in dataframe.columns]
if missing:
self.logger.error(f"Missing required column(s): {missing}")
if conflict:
self.logger.error(f"The following column(s) already exist and would be overwritten: {conflict}")
def run(
self,
storage:DataFlowStorage,
input_key: str = "content",
) -> list:
'''
Execute the k-center-greedy selection process
'''
self.input_key = input_key
dataframe = storage.read("dataframe")
self._validate_dataframe(dataframe)
texts = dataframe[self.input_key].tolist()
indexes = np.zeros(len(dataframe)).astype(int)
embeddings_list = self.embedding_serving.generate_embedding_from_input(texts)
embeddings = torch.tensor(embeddings_list)
sampler = KCenterGreedy(embedding=embeddings, sampling_ratio= self.num_samples / len(texts))
chosen_indexes = sampler.select_coreset_idxs()
for index in chosen_indexes:
indexes[index] = 1
dataframe = dataframe[np.array(indexes) == 1]
output_file = storage.write(dataframe)
self.logger.info(f"Results saved to {output_file}")
return [self.input_key,]
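# --- Illustrative sketch: exercising KCenterGreedy directly ---
# With 100 random embeddings and sampling_ratio 0.1 the greedy sampler returns 10 indices;
# real embeddings would come from embedding_serving.generate_embedding_from_input as in run().
if __name__ == "__main__":
    embedding = torch.randn(100, 8)
    sampler = KCenterGreedy(embedding=embedding, sampling_ratio=0.1)
    idxs = sampler.select_coreset_idxs()
    print(len(idxs), embedding[idxs].shape)  # 10 torch.Size([10, 8])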
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.operators.core_text import PromptedEvaluator
@OPERATOR_REGISTRY.register()
class PromptedFilter(OperatorABC):
'''
PromptedFilter scores rows with PromptedEvaluator and keeps only those whose scores fall within [min_score, max_score].
'''
def __init__(self, llm_serving: LLMServingABC, system_prompt: str = "Please evaluate the quality of this data on a scale from 1 to 5.", min_score = 1, max_score = 5):
self.logger = get_logger()
self.llm_serving = llm_serving
self.prompted_evaluator = PromptedEvaluator(llm_serving, system_prompt)
self.min_score = min_score
self.max_score = max_score
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"PromptedFilter 使用内置的 PromptedEvaluator 对输入数据进行数值化打分,"
"并根据指定的分数区间(min_score 到 max_score,闭区间)筛选出符合条件的样本。"
"默认情况下打分范围是 1–5,但用户可以通过 system_prompt 自定义其他评分规则。\n"
"\n输入参数:\n"
"- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n"
"- system_prompt:系统提示词,定义评估规范(可选,默认 "
"'Please evaluate the quality of this data on a scale from 1 to 5.')\n"
"- input_key:待评估文本所在列名(默认 'raw_content')\n"
"- output_key:写回打分结果的列名(默认 'eval',若已存在将被覆盖)\n"
"- min_score:筛选的最小分(默认 5)\n"
"- max_score:筛选的最大分(默认 5)\n"
"\n输出参数:\n"
"- 过滤后的 DataFrame(仅保留分数位于 [min_score, max_score] 的行)\n"
"- 返回 output_key 以供后续算子引用\n"
"\n备注:\n"
"- 默认打分区间是 1–5,但可根据实际 prompt 改变。"
)
elif lang == "en":
return (
"PromptedFilter leverages PromptedEvaluator to assign numeric scores to input data, "
"and filters rows whose scores fall within [min_score, max_score] (inclusive). "
"By default, the scoring scale is 1–5, but this can be customized through system_prompt.\n"
"\nInput Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC\n"
"- system_prompt: System prompt defining the evaluation criteria "
"(default: 'Please evaluate the quality of this data on a scale from 1 to 5.')\n"
"- input_key: Column name containing the text to evaluate (default 'raw_content')\n"
"- output_key: Column name to store the score (default 'eval'; overwritten if it exists)\n"
"- min_score: Minimum score for filtering (default 5)\n"
"- max_score: Maximum score for filtering (default 5)\n"
"\nOutput:\n"
"- Filtered DataFrame (rows with scores in [min_score, max_score])\n"
"- Returns output_key for downstream operators\n"
"\nNote:\n"
"- Default scoring range is 1–5, but can vary depending on the system_prompt."
)
else:
return "PromptedFilter scores rows via PromptedEvaluator and filters by a configurable score range (default 1–5)."
def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "eval"):
self.logger.info("Running PromptGenerator...")
# Load the raw dataframe from the input file
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
# Score each row with the underlying PromptedEvaluator
generated_outputs = self.prompted_evaluator.eval(dataframe, input_key)
# Add the generated content back to the dataframe
dataframe[output_key] = generated_outputs
filtered_dataframe = dataframe[(dataframe[output_key] >= self.min_score) & (dataframe[output_key] <= self.max_score)]
# Save the updated dataframe to the output file
output_file = storage.write(filtered_dataframe)
return output_key
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
@OPERATOR_REGISTRY.register()
class EmbeddingGenerator(OperatorABC):
'''
Embedding Generator is a class that generates embeddings for given input text.
'''
def __init__(self,
embedding_serving: LLMServingABC,
):
self.logger = get_logger()
self.embedding_serving = embedding_serving
self.logger.info(f"Initializing {self.__class__.__name__}...")
self.logger.info(f"{self.__class__.__name__} initialized.")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"EmbeddingGenerator算子用于从输入文本生成向量表示(embedding),"
"通常用于语义检索、聚类或下游模型输入等任务。\n\n"
"输入参数:\n"
"- embedding_serving:Embedding服务对象,需实现LLMServingABC接口,用于生成文本的向量表示\n"
"- input_key:输入文本字段名,默认为'text'\n"
"- output_key:输出向量字段名,默认为'embeddings'\n\n"
"输出参数:\n"
"- 包含文本向量的DataFrame,每行对应一个输入文本的embedding\n"
"- 返回输出字段名(如'embeddings'),可供后续算子引用"
)
elif lang == "en":
return (
"The EmbeddingGenerator operator generates vector representations (embeddings) "
"from input text, typically used for semantic retrieval, clustering, or downstream model inputs.\n\n"
"Input Parameters:\n"
"- embedding_serving: Embedding service object implementing the LLMServingABC interface for generating text embeddings\n"
"- input_key: Field name for input text, default is 'text'\n"
"- output_key: Field name for output embeddings, default is 'embeddings'\n\n"
"Output Parameters:\n"
"- DataFrame containing text embeddings, where each row corresponds to one input text\n"
"- Returns the output field name (e.g., 'embeddings') for subsequent operator reference"
)
else:
return (
"EmbeddingGenerator generates vector embeddings from text input for retrieval or representation learning tasks."
)
def run(self,
storage: DataFlowStorage,
input_key: str = "text",
output_key: str = "embeddings",
):
dataframe = storage.read("dataframe")
self.input_key = input_key
self.output_key = output_key
texts = dataframe[self.input_key].tolist()
embeddings_list = self.embedding_serving.generate_embedding_from_input(texts)
# embeddings = torch.tensor(embeddings_list)
dataframe[self.output_key] = embeddings_list
output_file = storage.write(dataframe)
self.logger.info(f"Results saved to {output_file}")
return [self.output_key]
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
import string
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict, PromptABC, DIYPromptABC
from typing import Union, Any, Set
from dataflow.prompts.core_text import StrFormatPrompt
@prompt_restrict(
StrFormatPrompt
)
@OPERATOR_REGISTRY.register()
class PromptTemplatedGenerator(OperatorABC):
def __init__(
self,
llm_serving: LLMServingABC,
prompt_template: Union[StrFormatPrompt, DIYPromptABC] = None,
):
self.logger = get_logger()
self.llm_serving = llm_serving
if prompt_template is None:
raise ValueError("prompt_template cannot be None")
self.prompt_template = prompt_template
def run(
self,
storage: DataFlowStorage,
output_key: str = "generated_content",
**input_keys: Any
):
self.storage: DataFlowStorage = storage
self.output_key = output_key
self.logger.info("Running PromptTemplatedGenerator...")
self.input_keys = input_keys
need_fields = set(input_keys.keys())
# Load the raw dataframe from the input file
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
llm_inputs = []
for idx, row in dataframe.iterrows():
key_dict = {}
for key in need_fields:
key_dict[key] = row[input_keys[key]]
prompt_text = self.prompt_template.build_prompt(need_fields, **key_dict)
llm_inputs.append(prompt_text)
self.logger.info(f"Prepared {len(llm_inputs)} prompts for LLM generation.")
# Create a list to hold all generated contents
# Generate content using the LLM serving
generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
dataframe[self.output_key] = generated_outputs
output_file = self.storage.write(dataframe)
return output_key
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于模板化提示词(Prompt Template)生成内容的算子。"
"该算子使用用户定义的提示模板(StrFormatPrompt 或 DIYPrompt),"
"结合输入数据中的字段自动构造完整提示词并调用大语言模型生成结果。\n\n"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口,用于执行文本生成任务\n"
"- prompt_template:提示词模板对象(StrFormatPrompt 或 DIYPromptABC),用于定义提示结构\n"
"- input_keys:输入字段映射字典,用于将DataFrame中的列名映射到模板字段\n"
"- output_key:输出生成内容字段名,默认为'generated_content'\n\n"
"输出参数:\n"
"- 包含生成结果的新DataFrame\n"
"- 返回输出字段名,以便后续算子引用\n\n"
"使用场景:\n"
"适用于需要通过模板化提示构建多样输入、批量生成文本内容的场景,例如标题生成、摘要生成、问答模板填充等。"
)
elif lang == "en":
return (
"An operator for content generation based on templated prompts. "
"This operator uses a user-defined prompt template (StrFormatPrompt or DIYPromptABC) "
"to automatically construct full prompts from input data fields and generate outputs via an LLM.\n\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface, responsible for text generation\n"
"- prompt_template: Prompt template object (StrFormatPrompt or DIYPromptABC) defining the prompt structure\n"
"- input_keys: Dictionary mapping DataFrame column names to template fields\n"
"- output_key: Field name for generated content, default is 'generated_content'\n\n"
"Output Parameters:\n"
"- DataFrame containing generated outputs\n"
"- Returns the output field name for downstream operator reference\n\n"
"Use Case:\n"
"Ideal for tasks requiring templated prompt-driven generation, such as title generation, text summarization, or Q&A filling."
)
else:
return (
"PromptTemplatedGenerator generates text based on a user-defined prompt template."
)
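# --- Illustrative sketch of the input_keys mapping used by PromptTemplatedGenerator.run ---
# Each template field is filled from the DataFrame column named in input_keys. The plain
# str.format template below is a hypothetical stand-in for StrFormatPrompt, whose exact
# build_prompt signature is not shown in this file.
if __name__ == "__main__":
    import pandas as pd

    template = "Summarize the article titled '{title}' for a {audience} audience."
    df = pd.DataFrame({"doc_title": ["DataFlow 101"], "reader": ["beginner"]})
    input_keys = {"title": "doc_title", "audience": "reader"}  # template field -> column name
    for _, row in df.iterrows():
        print(template.format(**{field: row[col] for field, col in input_keys.items()}))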
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
@OPERATOR_REGISTRY.register()
class PromptedGenerator(OperatorABC):
'''
PromptedGenerator generates content for each row by prepending the system prompt to the input text and calling the LLM.
'''
def __init__(self,
llm_serving: LLMServingABC,
system_prompt: str = "You are a helpful agent.",
json_schema: dict = None,
):
self.logger = get_logger()
self.llm_serving = llm_serving
self.json_schema = json_schema
self.system_prompt = system_prompt
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于用户提供的提示词(prompt)生成数据。结合系统提示词和输入内容生成符合要求的输出文本。"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- system_prompt:系统提示词,定义模型行为,默认为'You are a helpful agent.'\n"
"- input_key:输入内容字段名,默认为'raw_content'\n"
"- output_key:输出生成内容字段名,默认为'generated_content'\n"
"输出参数:\n"
"- 包含生成内容的DataFrame\n"
"- 返回输出字段名,用于后续算子引用"
)
elif lang == "en":
return (
"Generate data from user-provided prompts. Combines system prompt and input content to generate desired output text.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- system_prompt: System prompt to define model behavior, default is 'You are a helpful agent.'\n"
"- input_key: Field name for input content, default is 'raw_content'\n"
"- output_key: Field name for output generated content, default is 'generated_content'\n\n"
"Output Parameters:\n"
"- DataFrame containing generated content\n"
"- Returns output field name for subsequent operator reference"
)
else:
return (
"PromptedGenerator generates text based on system prompt and input content."
)
def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "generated_content"):
self.input_key, self.output_key = input_key, output_key
self.logger.info("Running PromptGenerator...")
# Load the raw dataframe from the input file
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
# Prepare LLM inputs by prepending the system prompt to each row's raw content
llm_inputs = []
for index, row in dataframe.iterrows():
raw_content = row.get(self.input_key, '')
# build one prompt per row so the generated outputs align with the dataframe length
llm_input = self.system_prompt + str(raw_content)
llm_inputs.append(llm_input)
# Generate the text using the model
try:
self.logger.info("Generating text using the model...")
if self.json_schema is not None:
generated_outputs = self.llm_serving.generate_from_input(llm_inputs, json_schema = self.json_schema)
else:
generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
self.logger.info("Text generation completed.")
except Exception as e:
self.logger.error(f"Error during text generation: {e}")
return
# Add the generated content back to the dataframe
dataframe[self.output_key] = generated_outputs
# Save the updated dataframe to the output file
output_file = storage.write(dataframe)
return output_key
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict, DIYPromptABC
from dataflow.prompts.general_text import SFTFromScratchGeneratorPrompt
from typing import Union
@prompt_restrict(
SFTFromScratchGeneratorPrompt
)
@OPERATOR_REGISTRY.register()
class RandomDomainKnowledgeRowGenerator(OperatorABC):
def __init__(
self,
llm_serving: LLMServingABC,
generation_num : int,
domain_keys : str,
prompt_template : Union[SFTFromScratchGeneratorPrompt, DIYPromptABC]= None,
):
self.logger = get_logger()
self.llm_serving = llm_serving
self.prompt_template = prompt_template
self.generation_num = generation_num
self.domain_keys = domain_keys
@staticmethod
def get_desc(lang: str = "zh"):
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"RandomDomainKnowledgeRowGenerator算子用于结合提示模板(prompt_template)与LLM服务对象(llm_serving),"
"批量生成与指定领域相关的文本内容。\n\n"
"功能说明:\n"
"- 结合SFTFromScratchGeneratorPrompt模板,根据domain_keys随机选择领域并生成内容;\n"
"- 当输入DataFrame为空时,可通过generation_num参数控制生成样本数量;\n"
"- 生成的文本结果将写入指定字段(output_key),并返回该字段名供后续算子使用。\n\n"
"参数说明:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口;\n"
"- prompt_template:提示模板实例,需为SFTFromScratchGeneratorPrompt类型;\n"
"- storage:DataFlowStorage对象,用于读取与写入数据;\n"
"- output_key:生成结果写入的字段名,默认为'generated_content';\n"
"- generation_num:生成内容数量,默认为1;\n"
"- domain_keys:指定或限制生成内容所属领域。\n\n"
"输出说明:\n"
"- 返回值:输出字段名(output_key),供后续算子引用;\n"
"- 同时将包含生成内容的新DataFrame写回至存储。"
)
elif lang == "en":
return (
"The RandomDomainKnowledgeRowGenerator operator generates domain-related text content "
"by combining a prompt template (prompt_template) with an LLM serving instance (llm_serving).\n\n"
"Function Description:\n"
"- Utilizes the SFTFromScratchGeneratorPrompt template to randomly select domains via domain_keys;\n"
"- Supports content generation when no input DataFrame is available, controlled by generation_num;\n"
"- Generated text is written to the specified output field (output_key), and the field name is returned.\n\n"
"Parameter Description:\n"
"- llm_serving: LLM serving object implementing the LLMServingABC interface;\n"
"- prompt_template: Prompt template instance of type SFTFromScratchGeneratorPrompt;\n"
"- storage: DataFlowStorage object used for reading and writing data;\n"
"- output_key: Name of the field to write generated results (default: 'generated_content');\n"
"- generation_num: Number of contents to generate when there is no input data (default: 1);\n"
"- domain_keys: Domain key(s) specifying or constraining the generation domain; empty string for random.\n\n"
"Output Description:\n"
"- Returns the output field name (output_key) for downstream reference;\n"
"- Writes the DataFrame containing generated content back to storage."
)
        else:
            return (
                "RandomDomainKnowledgeRowGenerator combines a prompt template (prompt_template) with an LLM serving object (llm_serving) to batch-generate domain-specific text content."
            )
def run(self, storage: DataFlowStorage, output_key: str = "generated_content"):
"""
主流程:基于输入数据和提示词生成文本内容。
参数说明:
- storage: DataFlowStorage对象,用于读写数据;
- output_key: 输出字段名,默认为'generated_content';
- generation_num: 生成内容的数量,默认为1;
返回:
- 输出字段名(output_key),供后续算子引用。
"""
self.output_key = output_key
self.logger.info("Running RandomDomainKnowledgeRowGenerator...")
        # Read the DataFrame from storage
dataframe = storage.read('dataframe')
self.logger.info(f"Loaded data, number of rows: {len(dataframe)}")
llm_inputs = []
        # Build generation_num prompts from the template
for i in range(self.generation_num):
llm_inputs.append(self.prompt_template.build_prompt(self.domain_keys))
try:
self.logger.info("Generating text using the model...")
            # Call the LLM serving backend to generate text
generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
self.logger.info("Text generation completed.")
except Exception as e:
self.logger.error(f"Error during text generation: {e}")
return
        # Write the generated content into a new DataFrame column
dataframe[self.output_key] = generated_outputs
        # Write the result back to storage
        output_file = storage.write(dataframe)
        self.logger.info(f"Results saved to {output_file}")
        return output_key
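
# Hedged usage sketch for RandomDomainKnowledgeRowGenerator (illustrative only;
# `my_serving` and `my_storage` are placeholders for a configured LLMServingABC
# implementation and a DataFlowStorage, and the prompt template is assumed to be
# constructible with default arguments):
#
#   op = RandomDomainKnowledgeRowGenerator(
#       llm_serving=my_serving,
#       generation_num=3,
#       domain_keys="medicine",
#       prompt_template=SFTFromScratchGeneratorPrompt(),
#   )
#   op.run(storage=my_storage, output_key="generated_content")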
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.serving.light_rag_serving import LightRAGServing
@OPERATOR_REGISTRY.register()
class RetrievalGenerator(OperatorABC):
def __init__(self,
llm_serving: LightRAGServing,
system_prompt: str = "You are a helpful agent.",
json_schema: dict = None,
):
self.logger = get_logger()
self.llm_serving = llm_serving
self.json_schema = json_schema
self.system_prompt = system_prompt
async def run(self,
storage: DataFlowStorage,
input_key: str = "raw_content",
output_key: str = "generated_content",
):
self.input_key, self.output_key = input_key, output_key
self.logger.info("Running RetrievalGenerator...")
# Load the raw dataframe from the input file
df = storage.read('dataframe')
self.logger.info(f"Loading, number of tasks: {len(df)}")
llm_inputs = []
for index, row in df.iterrows():
raw_content = row.get(self.input_key, '')
if raw_content:
llm_input = str(raw_content)
llm_inputs.append(llm_input)
try:
self.logger.info("Generating text using the model...")
generated_outputs = await self.llm_serving.generate_from_input(llm_inputs, self.system_prompt)
self.logger.info("Text generation completed.")
except Exception as e:
self.logger.error(f"Error during text generation: {e}")
return
df[self.output_key] = generated_outputs
output_file = storage.write(df)
return output_key
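
# Hedged usage sketch: run() above is a coroutine and must be awaited (or driven via
# asyncio.run). `my_rag_serving` and `my_storage` are placeholders for a configured
# LightRAGServing instance and a DataFlowStorage with a 'raw_content' column:
#
#   import asyncio
#   op = RetrievalGenerator(llm_serving=my_rag_serving)
#   asyncio.run(op.run(storage=my_storage, input_key="raw_content", output_key="generated_content"))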
from dataflow.prompts.text2qa import Text2MultiHopQAGeneratorPrompt
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
import random
from typing import Any, Dict, List, Optional, Sequence
import json
from tqdm import tqdm
import re
from dataflow.core.prompt import prompt_restrict, DIYPromptABC
from typing import Union
@prompt_restrict(
Text2MultiHopQAGeneratorPrompt
)
@OPERATOR_REGISTRY.register()
class Text2MultiHopQAGenerator(OperatorABC):
r"""A processor for generating multi-hop question-answer pairs from user
data.
This class handles the processing of text data to generate multi-hop
question-answer pairs using either an AI model or rule-based approaches.
It manages the entire pipeline from text preprocessing to dataset curation.
"""
def __init__(self,
llm_serving: LLMServingABC,
seed: int = 0,
lang="en",
prompt_template : Union[Text2MultiHopQAGeneratorPrompt, DIYPromptABC] = None,
num_q = 5
):
r"""Initialize the UserDataProcessor.
Args:
config (Optional[ProcessorConfig], optional): Configuration for
data processing. (default: :obj:`None`)
"""
self.rng = random.Random(seed)
self.llm_serving = llm_serving
self.lang = lang
self.logger = get_logger()
self.num_q = num_q
if prompt_template:
self.prompt_template = prompt_template
else:
self.prompt_template = Text2MultiHopQAGeneratorPrompt(lang=self.lang)
@staticmethod
def get_desc(lang: str = "zh") -> tuple:
"""Returns a description of the processor's functionality.
Args:
lang (str, optional): Language for description ('zh' or 'en').
Returns:
tuple: Description strings in specified language, including format example
"""
if lang == "zh":
return (
"MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。",
"处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。",
"输出格式如下:",
"输入:\n"
"text: <原始上下文文本>",
"输出:\n"
"{\n"
" \"text\": <处理后的文本字符串>,\n"
" \"qa_pairs\": [\n"
" {\n"
" \"question\": <字符串:生成的问题>,\n"
" \"reasoning_steps\": [\n"
" {\"step\": <推理过程的步骤 1>},\n"
" {\"step\": <步骤 2>} ...\n"
" ],\n"
" \"answer\": <字符串:最终答案>,\n"
" \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\n"
" \"type\": <可选:问题类型,如“生物学”、“历史”等>\n"
" },\n"
" ...\n"
" ],\n"
" \"metadata\": {\n"
" \"source\": <数据来源>,\n"
" \"timestamp\": <时间戳字符串>,\n"
" \"complexity\": <整数:问题复杂度标记>\n"
" }\n"
"}"
)
else:
return (
"MultiHopQAGenerator is a processor for generating multi-hop question-answer pairs from raw text.",
"It includes preprocessing, information extraction, and reasoning-based QA generation, with configurable LLM backends.",
"Expected output format:",
"Input:\n"
"text: <raw input context>",
"Output:\n"
"{\n"
" \"text\": <processed input text>,\n"
" \"qa_pairs\": [\n"
" {\n"
" \"question\": <string: generated question>,\n"
" \"reasoning_steps\": [\n"
" {\"step\": <inference step 1>},\n"
" {\"step\": <inference step 2>} ...\n"
" ],\n"
" \"answer\": <string: final answer>,\n"
" \"supporting_facts\": [<fact 1>, <fact 2>, ...],\n"
" \"type\": <optional string: QA category>\n"
" },\n"
" ...\n"
" ],\n"
" \"metadata\": {\n"
" \"source\": <source string>,\n"
" \"timestamp\": <timestamp string>,\n"
" \"complexity\": <integer: reasoning complexity>\n"
" }\n"
"}"
)
def process_text(
self, text: str, source: str = "user_input"
) -> List[Dict[str, Any]]:
r"""Process a single text to generate multi-hop QA pairs.
Args:
text (str): The input text to process.
source (str, optional): Source identifier for the text.
(default: :obj:`"user_input"`)
Returns:
List[Dict[str, Any]]: List of processed examples with QA pairs and
metadata.
"""
# Convert text to standard format
raw_data = [
{
'text': text,
'source': source,
}
]
# Construct examples
        constructor = ExampleConstructor(lang=self.lang, llm_serving=self.llm_serving, prompt_template=self.prompt_template)
examples = constructor.construct_examples(raw_data)
# Manage data
# curator = DataCurator(self.config, self.rng)
# final_dataset = curator.curate_dataset(examples)
return examples
def process_batch(
self, texts: List[str], sources: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
r"""Process multiple texts in batch to generate multi-hop QA pairs.
Args:
texts (List[str]): List of input texts to process.
sources (Optional[List[str]], optional): List of source
identifiers. (default: :obj:`None`)
Returns:
List[Dict[str, Any]]: List of processed examples with QA pairs and
metadata.
Raises:
ValueError: If length of sources doesn't match length of texts.
"""
if sources is None:
sources = ["default_source"] * len(texts)
elif len(sources) != len(texts):
raise ValueError("Length of sources must match length of texts")
raw_data = [
{
'text': text,
'source': source,
}
for text, source in zip(texts, sources)
]
# Construct examples
constructor = ExampleConstructor(
lang=self.lang,
llm_serving=self.llm_serving,
prompt_template = self.prompt_template
)
examples = constructor.construct_examples(raw_data)
# # Manage data
# curator = DataCurator(self.config, self.rng)
# final_dataset = curator.curate_dataset(examples)
return examples
def _validate_dataframe(self, dataframe: pd.DataFrame):
required_keys = [self.input_key]
forbidden_keys = [self.output_key]
missing = [k for k in required_keys if k not in dataframe.columns]
conflict = [k for k in forbidden_keys if k in dataframe.columns]
if missing:
raise ValueError(f"Missing required column(s): {missing}")
if conflict:
raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")
    def run(
        self,
        storage: DataFlowStorage = None,
        input_key: str = 'cleaned_chunk',
        output_key: str = 'QA_pairs',
        output_meta_key: str = 'QA_metadata',
    ):
self.input_key, self.output_key, self.output_meta_key = input_key, output_key, output_meta_key
dataframe = storage.read("dataframe")
self._validate_dataframe(dataframe)
texts = dataframe[self.input_key].tolist()
outputs=self.process_batch(texts)
        dataframe[self.output_key] = [
            output['qa_pairs'][:self.num_q]  # slicing already caps the list at num_q
            for output in outputs
        ]
dataframe[self.output_meta_key] = [output['metadata'] for output in outputs]
output_file = storage.write(dataframe)
self.logger.info(f"Results saved to {output_file}")
return [output_key]
class ExampleConstructor:
r"""Constructs training examples from raw text data.
This class handles the construction of training examples by preprocessing
text, extracting information pairs, and generating question-answer pairs.
"""
def __init__(
self,
lang: str = "en",
llm_serving: LLMServingABC = None,
min_text_length: int = 100,
max_text_length: int = 200000,
prompt_template = None
):
r"""Initialize the ExampleConstructor.
Args:
config (ProcessorConfig): Configuration for example construction.
multi_hop_agent (Optional[MultiHopGeneratorAgent], optional):
Agent for generating multi-hop QA pairs. (default: :obj:`None`)
"""
self.lang = lang
        self.llm_serving = llm_serving
self.logger = get_logger()
self.max_length = max_text_length
self.min_length = min_text_length
# self.prompt = Text2MultiHopQAGeneratorPrompt(lang=self.lang)
if prompt_template:
self.prompt_template = prompt_template
else:
self.prompt_template = Text2MultiHopQAGeneratorPrompt(lang=self.lang)
def construct_examples(
self, raw_data: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
r"""Construct training examples from raw data.
Args:
raw_data (List[Dict[str, Any]]): List of raw data dictionaries
containing text and metadata.
Returns:
List[Dict[str, Any]]: List of constructed examples with QA pairs
and metadata.
"""
self.logger.info("Starting to construct examples...")
examples = []
for data in tqdm(raw_data, desc="Constructing examples"):
# 1. Text preprocessing
processed_text = self._preprocess_text(data.get('text', ''))
if not processed_text:
example = {
# 'text': processed_text,
'qa_pairs': [],
'metadata': {
'source': data.get('source', 'unknown'),
'timestamp': data.get('timestamp', ''),
'complexity': 0,
},
}
examples.append(example)
continue
# 2. Generate key information pairs
info_pairs = self._extract_info_pairs(processed_text)
# 3. Construct question-answer pairs
            if info_pairs:
qa_pairs = self._generate_qa_pairs(info_pairs)
else:
qa_pairs = []
# 4. Add metadata
example = {
# 'text': processed_text,
'qa_pairs': qa_pairs,
'metadata': {
'source': data.get('source', 'unknown'),
'timestamp': data.get('timestamp', ''),
'complexity': self._calculate_complexity(qa_pairs) if qa_pairs else 0,
},
}
examples.append(example)
self.logger.info(f"Successfully constructed {len(examples)} examples")
return examples
def _preprocess_text(self, text: str) -> str:
r"""Preprocess input text for example construction.
Args:
text (str): Input text to preprocess.
Returns:
str: Preprocessed text, or empty string if text fails quality
checks.
"""
if not isinstance(text, str):
return ''
# 1. Basic cleaning
text = text.strip()
# 2. Length check
if (
len(text) < self.min_length
or len(text) > self.max_length
):
self.logger.warning("text fail to pass length check.")
return ''
# 3. Quality check
if not self._check_text_quality(text):
self.logger.warning("text fail to pass quality check.")
return ''
return text
def _calculate_special_char_ratio(self,text):
        # Unicode ranges for Chinese characters (basic CJK block plus extensions)
        chinese_ranges = [
            (0x4E00, 0x9FFF),    # CJK Unified Ideographs
            (0x3400, 0x4DBF),    # Extension A
            (0x20000, 0x2A6DF),  # Extension B
            (0x2A700, 0x2B73F),  # Extension C
            (0x2B740, 0x2B81F),  # Extension D
            (0x2B820, 0x2CEAF)   # Extension E
        ]
special_count = 0
for c in text:
            # Count a character as special unless it is Chinese, alphanumeric, or whitespace
is_chinese = any(start <= ord(c) <= end for start, end in chinese_ranges)
if not (c.isalnum() or c.isspace() or is_chinese):
special_count += 1
return special_count / len(text) if text else 0
def _check_text_quality(self, text: str) -> bool:
r"""Check the quality of input text.
Args:
text (str): Text to check quality for.
Returns:
bool: True if text passes quality checks, False otherwise.
"""
# 1. Basic quality check
if (text.count('。') < 2 and text.count('.') < 2): # Must have at least 2 sentences
return False
# 2. Special character ratio check
special_char_ratio = self._calculate_special_char_ratio(text)
if special_char_ratio > 0.3: # No more than 30% special characters
return False
return True
def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
r"""Extract information pairs and relationships from text.
Args:
text (str): Input text to extract information from.
Returns:
List[Dict[str, Sequence[str]]]: List of dictionaries containing
premise, intermediate, conclusion, and related contexts.
"""
# Split into sentences
        if self.lang == "en":
sentences = [s.strip() for s in text.split('.') if s.strip()]
else:
sentences = [s.strip() for s in text.split('。') if s.strip()]
info_pairs = []
# Extract combinations of multiple related sentences
for i in range(len(sentences) - 2):
if len(sentences[i]) > 10 and len(sentences[i + 1]) > 10:
info_pairs.append(
{
'premise': sentences[i],
'intermediate': sentences[i + 1],
'conclusion': sentences[i + 2]
if i + 2 < len(sentences)
else '',
'related_contexts': [
s
for j, s in enumerate(sentences)
if j != i and j != i + 1 and len(s) > 10
][:2],
# Limit to 2 additional related contexts
}
)
return info_pairs
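    # Worked example (illustrative): with four qualifying sentences [S0, S1, S2, S3],
    # the window at i=0 yields
    #   {'premise': S0, 'intermediate': S1, 'conclusion': S2, 'related_contexts': [S2, S3]}
    # and the window at i=1 yields
    #   {'premise': S1, 'intermediate': S2, 'conclusion': S3, 'related_contexts': [S0, S3]}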
def _generate_qa_pairs(
self, info_pairs: List[Dict[str, Sequence[str]]]
) -> List[Dict[str, str]]:
r"""Generate multi-hop question-answer pairs from information pairs.
Args:
info_pairs (List[Dict[str, Sequence[str]]]): List of information
pairs extracted from text.
Returns:
List[Dict[str, str]]: List of generated QA pairs.
"""
user_inputs=[]
for pair in info_pairs:
# 1. Generate multi-hop question-answer pair using AI
# Construct full context
context = (
f"{pair['premise']}. {pair['intermediate']}."
f" {pair['conclusion']}"
)
user_inputs.append(self.prompt_template.build_prompt(context))
sys_prompt=self.prompt_template.build_system_prompt()
        responses = self.llm_serving.generate_from_input(user_inputs=user_inputs, system_prompt=sys_prompt)
qa_pairs=self._extract_qa_pairs(responses)
return qa_pairs
def _extract_qa_pairs(self, responses: List[str]) -> List[Dict[str, Any]]:
"""
从原始响应中精确提取符合结构的QA对
自动跳过非法JSON和干扰文本
"""
qa_pairs = []
for response in responses:
# self.logger.info(f"generated qa: {response}")
            # Method 1: try to parse the whole response as a single JSON value
try:
qa_pair = json.loads(response)
if isinstance(qa_pair, dict) and "question" in qa_pair:
qa_pairs.append(qa_pair)
continue
elif isinstance(qa_pair, list):
for item in qa_pair:
if isinstance(item, dict) and "question" in item:
qa_pairs.append(item)
continue
except json.JSONDecodeError:
pass
            # Method 2: scan the response for balanced top-level JSON objects
try:
                # Track brace depth to locate complete top-level JSON objects
brace_count = 0
start_pos = -1
json_objects = []
for i, char in enumerate(response):
if char == '{':
if brace_count == 0:
start_pos = i
brace_count += 1
elif char == '}':
brace_count -= 1
if brace_count == 0 and start_pos != -1:
json_str = response[start_pos:i+1]
json_objects.append(json_str)
start_pos = -1
                # Try to parse each candidate JSON string
for json_str in json_objects:
try:
qa_pair = json.loads(json_str)
if (isinstance(qa_pair, dict) and \
"question" in qa_pair and \
"reasoning_steps" in qa_pair and \
"answer" in qa_pair and \
"supporting_facts" in qa_pair and \
"type" in qa_pair):
qa_pairs.append(qa_pair)
self.logger.info(f"Successfully extracted QA pair: {qa_pair['question']}")
except json.JSONDecodeError as e:
self.logger.debug(f"Failed to parse JSON object: {json_str[:100]}... Error: {e}")
continue
                # Deduplicate QA pairs that share the same question
if qa_pairs:
seen_questions = set()
unique_qa_pairs = []
for qa_pair in qa_pairs:
question = qa_pair.get("question", "").strip().lower()
if question and question not in seen_questions:
seen_questions.add(question)
unique_qa_pairs.append(qa_pair)
self.logger.debug(f"Added unique question: {qa_pair['question']}")
else:
self.logger.debug(f"Skipped duplicate question: {qa_pair.get('question', 'N/A')}")
qa_pairs = unique_qa_pairs
self.logger.info(f"After deduplication: {len(qa_pairs)} unique QA pairs")
                # Warn if no JSON objects were found in the response
if not json_objects:
self.logger.warning("No JSON objects found in model response.")
except Exception as e:
self.logger.warning(f"Failed to parse QA information from model response. Error: {e}")
return qa_pairs
def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
r"""Calculate the complexity score for a set of QA pairs.
Args:
qa_pairs (List[Dict[str, Any]]): List of QA pairs to calculate
complexity for.
Returns:
float: Complexity score between 0.0 and 1.0.
"""
if not qa_pairs:
return 0.0
# Calculate complexity based on multiple factors
complexities = []
for qa in qa_pairs:
# 1. Number of reasoning steps
reasoning_steps_count = len(qa.get('reasoning_steps', []))
# 2. Number of supporting facts
supporting_facts_count = len(qa.get('supporting_facts', []))
# 3. Question length
question_length = len(qa.get('question', '').split())
# 4. Answer length
answer_length = len(qa.get('answer', '').split())
# Calculate complexity of a single QA pair
qa_complexity = (
min(reasoning_steps_count / 3, 1.0)
* 0.4 # Weight for reasoning steps
+ min(supporting_facts_count / 3, 1.0)
* 0.3 # Weight for supporting facts
+ min(question_length / 20, 1.0)
* 0.15 # Weight for question length
+ min(answer_length / 50, 1.0) * 0.15
# Weight for answer length
)
complexities.append(qa_complexity)
return sum(complexities) / len(complexities)
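
# Worked example for the complexity score above (illustrative numbers): a QA pair with
# 3 reasoning steps, 2 supporting facts, a 10-word question and a 25-word answer scores
#   min(3/3, 1)*0.4 + min(2/3, 1)*0.3 + min(10/20, 1)*0.15 + min(25/50, 1)*0.15
#   = 0.4 + 0.2 + 0.075 + 0.075 = 0.75
# and the per-pair scores are averaged over all QA pairs of the example.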
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict
import ast
import json
from dataflow.prompts.text2qa import Text2QASeedQuestionGeneratorPrompt,Text2QAAutoPromptGeneratorPrompt
@prompt_restrict(
Text2QAAutoPromptGeneratorPrompt,
Text2QASeedQuestionGeneratorPrompt
)
@OPERATOR_REGISTRY.register()
class Text2QAGenerator(OperatorABC):
    '''
    Text2QAGenerator uses LLMs to first derive question-generation prompts from the input text
    and then generate seed QA pairs based on those prompts.
    '''
def __init__(self,
llm_serving: LLMServingABC,
                 # prompt_template = None  # the prompt templates are fixed for this operator
):
self.logger = get_logger()
self.llm_serving = llm_serving
self.prompt_template = Text2QAAutoPromptGeneratorPrompt()
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于为给定的文档片段生成种子QA对。\n\n"
"输入参数:\n"
"- input_key: 包含文档片段的字段名\n"
"- prompt_key: 包含提示词的字段名\n"
"- output_quesion_key: 包含生成问题的字段名\n"
"- output_answer_key: 包含生成答案的字段名\n"
)
elif lang == "en":
return (
"This operator generates seed QA pairs for given document fragments.\n\n"
"Input Parameters:\n"
"- input_key: Field name containing the content\n"
"- prompt_key: Field name containing the generated prompt\n"
"- output_quesion_key: Field name containing the generated question\n"
"- output_answer_key: Field name containing the generated answer\n"
)
else:
return "QAGenerator generates QA pairs for given document fragments."
def _validate_dataframe(self, dataframe: pd.DataFrame):
required_keys = [self.input_key]
forbidden_keys = [self.output_question_key, self.output_answer_key]
missing = [k for k in required_keys if k not in dataframe.columns]
conflict = [k for k in forbidden_keys if k in dataframe.columns]
if missing:
raise ValueError(f"Missing required column(s): {missing}")
if conflict:
raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")
def _build_prompt(self, df, types):
if types == "prompt":
self.prompt_template = Text2QAAutoPromptGeneratorPrompt()
texts = df[self.input_key].tolist()
output = [self.prompt_template.build_prompt(text) for text in texts]
elif types == "qa":
self.prompt_template = Text2QASeedQuestionGeneratorPrompt()
output = []
for index, row in df.iterrows():
output.append(row[self.output_prompt_key] + self.prompt_template.build_prompt() + row[self.input_key])
return output
def _parse_qa(self, response: str) -> tuple:
lines = response.strip().split('\n')
q = next((line[2:].strip() for line in lines if line.lower().startswith("q:")), "")
a = next((line[2:].strip() for line in lines if line.lower().startswith("a:")), "")
return q, a
def parse_list_string(self, s: str) -> list:
        # Strip the surrounding [ ]
s = s.strip()[1:-1]
        # Split on commas and drop empty items
items = [item.strip() for item in s.split(",") if item.strip()]
return items
def run(
self,
storage: DataFlowStorage,
input_key:str = "text",
input_question_num:int = 1,
output_prompt_key:str = "generated_prompt",
output_quesion_key:str = "generated_question",
output_answer_key:str = "generated_answer"
):
'''
Runs the QA generation process, reading from the input file and saving results to output.
'''
        self.input_key, self.input_question_num, self.output_prompt_key, self.output_question_key, self.output_answer_key = input_key, input_question_num, output_prompt_key, output_question_key, output_answer_key
dataframe = storage.read("dataframe")
self._validate_dataframe(dataframe)
formatted_prompts = self._build_prompt(dataframe, "prompt")
raw_prompts = self.llm_serving.generate_from_input(
user_inputs=formatted_prompts,
system_prompt=""
)
        prompts = []
        for i, p in enumerate(raw_prompts):
            try:
                prompts.append(json.loads(p))
            except json.JSONDecodeError:
                self.logger.warning(f"Failed to parse prompt at index {i}: {p}")
                prompts.append([])  # keep row alignment with the original dataframe
expanded_rows = []
expanded_prompts = []
for idx, prompt_list in enumerate(prompts):
            for p in prompt_list[:self.input_question_num]:  # slicing already caps at input_question_num
                expanded_rows.append(dataframe.iloc[idx].to_dict())  # duplicate the source row
                expanded_prompts.append(p)  # the prompt that corresponds to it
dataframe = pd.DataFrame(expanded_rows)
dataframe[self.output_prompt_key] = expanded_prompts
formatted_prompts = self._build_prompt(dataframe, "qa")
responses = self.llm_serving.generate_from_input(user_inputs=formatted_prompts, system_prompt="")
questions, answers = zip(*[self._parse_qa(r) for r in responses])
dataframe[self.output_question_key] = questions
dataframe[self.output_answer_key] = answers
output_file = storage.write(dataframe)
self.logger.info(f"Results saved to {output_file}")
return [self.output_question_key, self.output_answer_key]
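
# Hedged usage sketch for Text2QAGenerator (illustrative only; `my_serving` and
# `my_storage` are placeholders for a configured LLMServingABC backend and a
# DataFlowStorage whose dataframe has a 'text' column):
#
#   op = Text2QAGenerator(llm_serving=my_serving)
#   op.run(
#       storage=my_storage,
#       input_key="text",
#       input_question_num=2,   # keep at most 2 generated prompts per fragment
#       output_prompt_key="generated_prompt",
#       output_question_key="generated_question",
#       output_answer_key="generated_answer",
#   )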
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
import pandas as pd
@OPERATOR_REGISTRY.register()
class PandasOperator(OperatorABC):
def __init__(self, process_fn: list):
self.logger = get_logger()
self.process_fn = process_fn
self.logger.info(f"Initializing {self.__class__.__name__} with transform functions: {self.process_fn}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子支持通过多个自定义函数对 DataFrame 进行任意操作(如添加列、重命名、排序等)。\n\n"
"每个函数(通常为 lambda 表达式)接受一个 DataFrame 并返回一个修改后的 DataFrame。\n\n"
"输入参数:\n"
"- process_fn:一个函数列表,每个函数形式为 lambda df: ...,"
"必须返回一个 DataFrame。\n\n"
"示例:\n"
" - lambda df: df.assign(score2=df['score'] * 2)\n"
" - lambda df: df.sort_values('score', ascending=False)"
)
elif lang == "en":
return (
"This operator applies a list of transformation functions to a DataFrame.\n\n"
"Each function (typically a lambda) takes a DataFrame and returns a modified DataFrame.\n\n"
"Input Parameters:\n"
"- process_fn: A list of functions, each in the form of lambda df: ..., "
"and must return a DataFrame.\n\n"
"Examples:\n"
" - lambda df: df.assign(score2=df['score'] * 2)\n"
" - lambda df: df.sort_values('score', ascending=False)"
)
else:
return "Applies a sequence of transformation functions to a DataFrame."
def run(self, storage: DataFlowStorage):
df = storage.read("dataframe")
for fn in self.process_fn:
if not callable(fn):
raise ValueError("Each transform function must be callable (e.g., lambda df: ...)")
df = fn(df)
if not isinstance(df, pd.DataFrame):
raise ValueError("Each transform function must return a DataFrame")
self.logger.info(f"Transformation complete. Final shape: {df.shape}")
storage.write(df)
return ""
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
@OPERATOR_REGISTRY.register()
class PromptedRefiner(OperatorABC):
    '''
    PromptedRefiner rewrites the text in a specified column in place, guided by the given system prompt.
    '''
def __init__(self, llm_serving: LLMServingABC, system_prompt: str = "You are a helpful agent."):
self.logger = get_logger()
self.llm_serving = llm_serving
self.system_prompt = system_prompt
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"PromptedRefiner 根据给定的 system_prompt 对指定列的文本进行改写/润色/规范化,"
"并将结果**就地写回**同一列(覆盖原内容)。其做法是对每一行拼接 "
"`system_prompt + raw_content` 作为模型输入,批量生成改写结果。\n"
"\n输入参数:\n"
"- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n"
"- system_prompt:系统提示词,用于描述改写目标与风格(默认 'You are a helpful agent.')\n"
"- input_key:要改写的文本列名(默认 'raw_content'),改写后会覆盖该列\n"
"\n输出参数:\n"
"- 覆盖后的 DataFrame(同名列被改写后的文本)\n"
"- 无返回值(结果已通过 DataFlowStorage 写出)\n"
"\n备注:\n"
"- 该算子**覆盖** input_key 列;若需保留原文,建议先拷贝到新列。\n"
"- 期望每行在 input_key 列提供可用文本;空值将不会生成对应输入,如与行数不匹配可能导致赋值报错。"
)
elif lang == "en":
return (
"PromptedRefiner rewrites/refines/normalizes text in a specified column **in place**, "
"using a provided system_prompt. For each row it concatenates "
"`system_prompt + raw_content` as the model input and generates the refined text.\n"
"\nInput Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC\n"
"- system_prompt: Instruction describing the rewrite goal/style (default 'You are a helpful agent.')\n"
"- input_key: Column to refine (default 'raw_content'); the refined output **overwrites** this column\n"
"\nOutput:\n"
"- DataFrame with the same column overwritten by refined text\n"
"- No return value (the result is written via DataFlowStorage)\n"
"\nNotes:\n"
"- This operator **overwrites** the input_key column; copy it first if you need to keep originals.\n"
"- Each row is expected to provide text in input_key; missing/empty rows won’t form inputs, which may cause "
"length-mismatch errors on assignment."
)
else:
return (
"PromptedRefiner rewrites a chosen column in place using `system_prompt + raw_content` as input."
)
def run(self, storage: DataFlowStorage, input_key: str = "raw_content"):
self.input_key = input_key
self.logger.info("Running PromptGenerator...")
# Load the raw dataframe from the input file
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
        # Prepare LLM inputs by prepending the system prompt to each row's raw content
        llm_inputs = []
for index, row in dataframe.iterrows():
raw_content = row.get(self.input_key, '')
if raw_content:
llm_input = self.system_prompt + str(raw_content)
llm_inputs.append(llm_input)
# Generate the text using the model
try:
self.logger.info("Generating text using the model...")
generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
self.logger.info("Text generation completed.")
except Exception as e:
self.logger.error(f"Error during text generation: {e}")
return
# Add the generated content back to the dataframe
dataframe[self.input_key] = generated_outputs
        # Save the updated dataframe to the output file
        output_file = storage.write(dataframe)
        self.logger.info(f"Results saved to {output_file}")
        return
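
# Hedged usage sketch for PromptedRefiner (illustrative only; `my_serving` and
# `my_storage` are placeholders for a configured LLMServingABC backend and a
# DataFlowStorage whose dataframe has a 'raw_content' column). The refined text
# overwrites the input column, so copy it to another column first if the original
# must be preserved:
#
#   op = PromptedRefiner(
#       llm_serving=my_serving,
#       system_prompt="Rewrite the following text in formal English:\n",
#   )
#   op.run(storage=my_storage, input_key="raw_content")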
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# generate
from .generate.prompted_vqa_generator import PromptedVQAGenerator
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/core_vision/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/core_vision/", _import_structure)