OpenDAS / DataFlow · Commit 97e8278b

Authored Dec 03, 2025 by zzg_666

    适配后端vllm (adapt the backend for vLLM)

Pipeline #3071 canceled with stages · Changes: 385 · Pipelines: 1

Showing 20 changed files with 2628 additions and 0 deletions
dataflow/operators/core_speech/__init__.py                                      +14   -0
dataflow/operators/core_speech/generate/speech2text_generator.py                +81   -0
dataflow/operators/core_text/__init__.py                                        +27   -0
dataflow/operators/core_text/eval/bench_dataset_evaluator.py                    +201  -0
dataflow/operators/core_text/eval/bench_dataset_evaluator_question.py           +299  -0
dataflow/operators/core_text/eval/prompted_eval.py                              +112  -0
dataflow/operators/core_text/eval/text2qa_sample_evaluator.py                   +175  -0
dataflow/operators/core_text/filter/general_filter.py                           +79   -0
dataflow/operators/core_text/filter/kcentergreedy_filter.py                     +219  -0
dataflow/operators/core_text/filter/prompted_filter.py                          +81   -0
dataflow/operators/core_text/generate/embedding_generator.py                    +72   -0
dataflow/operators/core_text/generate/prompt_templated_generator.py             +102  -0
dataflow/operators/core_text/generate/prompted_generator.py                     +90   -0
dataflow/operators/core_text/generate/random_domain_knowledge_row_generator.py  +118  -0
dataflow/operators/core_text/generate/retrieval_generator.py                    +53   -0
dataflow/operators/core_text/generate/text2multihopqa_generator.py              +601  -0
dataflow/operators/core_text/generate/text2qa_generator.py                      +146  -0
dataflow/operators/core_text/refine/pandas_operator.py                          +52   -0
dataflow/operators/core_text/refine/prompted_refiner.py                         +92   -0
dataflow/operators/core_vision/__init__.py                                      +14   -0
dataflow/operators/core_speech/__init__.py (new file, 0 → 100644)

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # generate
    from .generate.speech2text_generator import Speech2TextGenerator
else:
    import sys
    from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking

    cur_path = "dataflow/operators/core_speech/"

    _import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
    sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/core_speech/", _import_structure)
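The `sys.modules[__name__] = LazyLoader(...)` line above is the standard lazy-import idiom: the package module is replaced by a proxy that imports each operator only on first attribute access. A toy stand-in for illustration, assuming nothing about dataflow's real `LazyLoader` beyond this idiom:

import importlib
import sys
import types

class ToyLazyLoader(types.ModuleType):
    """Minimal sketch of a lazy module: resolves attributes on first access."""
    def __init__(self, name, import_map):
        super().__init__(name)
        self._import_map = import_map  # attribute name -> submodule path

    def __getattr__(self, item):
        # import the submodule lazily, then pull the requested attribute from it
        module = importlib.import_module(self._import_map[item])
        return getattr(module, item)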
dataflow/operators/core_speech/generate/speech2text_generator.py (new file, 0 → 100644)

from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC

import os
import math
import warnings
import base64
from io import BytesIO
from typing import List, Optional, Union, Dict, Tuple


@OPERATOR_REGISTRY.register()
class Speech2TextGenerator(OperatorABC):
    def __init__(self,
                 llm_serving: LLMServingABC,
                 system_prompt: str = "You are a helpful assistant",
                 ):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.system_prompt = system_prompt

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子用于将语音内容转录为文本。它接收语音文件路径或URL,使用大语言模型进行转录,"
                "并将转录结果保存到数据框中。\n"
                "输入参数:\n"
                "- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
                "- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant'\n"
                "- input_key:输入语音文件路径或URL的字段名,默认为'raw_content'\n"
                "- output_key:输出转录文本的字段名,默认为'generated_content'\n"
                "输出参数:\n"
                "- 返回输出字段名,用于后续算子引用\n"
                "- 在数据框中添加包含转录文本的新列"
            )
        elif lang == "en":
            return (
                "This operator transcribes speech content into text. It receives paths or URLs to speech files, "
                "uses a large language model for transcription, and saves the transcription results to the dataframe.\n"
                "Input Parameters:\n"
                "- llm_serving: LLM serving object implementing LLMServingABC interface\n"
                "- system_prompt: System prompt to define model behavior, default is 'You are a helpful assistant'\n"
                "- input_key: Field name for input speech file paths or URLs, default is 'raw_content'\n"
                "- output_key: Field name for output transcription text, default is 'generated_content'\n\n"
                "Output Parameters:\n"
                "- Returns output field name for subsequent operator reference\n"
                "- Adds a new column containing transcription text to the dataframe"
            )
        else:
            return (
                "SpeechTranscriptor converts speech files to text using a large language model and saves results to a dataframe."
            )

    def run(self,
            storage: DataFlowStorage,
            input_key: str = "raw_content",
            output_key: str = "generated_content"
            ):
        self.input_key, self.output_key = input_key, output_key
        self.logger.info("Running Speech Transcriptor...")
        dataframe = storage.read('dataframe')
        self.logger.info(f"Loading, number of rows: {len(dataframe)}")
        llm_inputs = []
        for index, row in dataframe.iterrows():
            path_or_url = row.get(self.input_key, '')
            llm_inputs.append(path_or_url)
        transcriptions = self.llm_serving.generate_from_input(user_inputs=llm_inputs, system_prompt=self.system_prompt)
        dataframe[self.output_key] = transcriptions
        output_file = storage.write(dataframe)
        self.logger.info(f"Saving to {output_file}")
        self.logger.info("Speech Transcriptor done")
        return output_key
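A minimal usage sketch for the operator above, with throwaway stubs standing in for the storage and serving objects. The stubs are illustrative assumptions, not part of the dataflow package; a real pipeline passes a DataFlowStorage and an LLMServingABC implementation, and the sketch assumes the dataflow package is importable for `Speech2TextGenerator` itself:

import pandas as pd

class StubStorage:
    """Implements only the read/write surface Speech2TextGenerator touches."""
    def __init__(self, df):
        self.df = df
    def read(self, _fmt):
        return self.df
    def write(self, df):
        self.df = df
        return "stub_output.json"

class StubServing:
    """Returns a fake transcription per input path/URL."""
    def generate_from_input(self, user_inputs, system_prompt=""):
        return [f"transcript of {p}" for p in user_inputs]

storage = StubStorage(pd.DataFrame({"raw_content": ["a.wav", "b.wav"]}))
op = Speech2TextGenerator(llm_serving=StubServing())
out_key = op.run(storage, input_key="raw_content", output_key="generated_content")
print(storage.df[out_key].tolist())  # ['transcript of a.wav', 'transcript of b.wav']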
dataflow/operators/core_text/__init__.py (new file, 0 → 100644)

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .generate.prompted_generator import PromptedGenerator
    from .generate.prompt_templated_generator import PromptTemplatedGenerator
    from .generate.random_domain_knowledge_row_generator import RandomDomainKnowledgeRowGenerator
    from .generate.text2qa_generator import Text2QAGenerator
    from .generate.text2multihopqa_generator import Text2MultiHopQAGenerator
    from .generate.embedding_generator import EmbeddingGenerator
    from .generate.retrieval_generator import RetrievalGenerator
    from .eval.bench_dataset_evaluator import BenchDatasetEvaluator
    from .eval.bench_dataset_evaluator_question import BenchDatasetEvaluatorQuestion
    from .eval.text2qa_sample_evaluator import Text2QASampleEvaluator
    from .eval.prompted_eval import PromptedEvaluator
    from .filter.prompted_filter import PromptedFilter
    from .filter.kcentergreedy_filter import KCenterGreedyFilter
    from .filter.general_filter import GeneralFilter
    from .refine.prompted_refiner import PromptedRefiner
    from .refine.pandas_operator import PandasOperator
else:
    import sys
    from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking

    cur_path = "dataflow/operators/core_text/"

    _import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
    sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/core_text/", _import_structure)
dataflow/operators/core_text/eval/bench_dataset_evaluator.py (new file, 0 → 100644)

from dataflow.utils.reasoning.AnswerExtraction import StringCleaner, UnitTextManager, AnswerExtractor
from dataflow.prompts.model_evaluation.general import AnswerJudgePrompt
from dataflow.core.prompt import DIYPromptABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import LLMServingABC
from dataflow.core import OperatorABC
from math_verify import parse, verify
from dataflow import get_logger
from typing import Literal
import pandas as pd
import numpy as np
import time
import os
import re


@OPERATOR_REGISTRY.register()
class BenchDatasetEvaluator(OperatorABC):
    def __init__(self,
                 eval_result_path: str = None,
                 compare_method: Literal["match", "semantic"] = "match",
                 system_prompt: str = "You are a helpful assistant specialized in evaluating answer correctness.",
                 llm_serving: LLMServingABC = None,
                 prompt_template: DIYPromptABC = None):
        if eval_result_path is None:
            timestamp = int(time.time())
            eval_result_path = f"result_bencheval/BenchDatasetEvaluator_result_{timestamp}.json"
        self.eval_result_path = eval_result_path
        self.compare_method = compare_method
        self.empty_responses_count = 0
        if compare_method == "match":
            self.compare = self.math_verify_compare
            unit_manager = UnitTextManager()
            string_cleaner = StringCleaner(unit_manager)
            self.answer_extractor = AnswerExtractor(string_cleaner)
        else:
            if prompt_template is None:
                prompt_template = AnswerJudgePrompt()
            self.prompt_template = prompt_template
            self.system_prompt = system_prompt
            self.llm_serving = llm_serving
        self.logger = get_logger()

    def math_verify_compare(self, answer, ground_truth):
        try:
            return verify(parse(str(ground_truth)), parse(str(answer)))
        except:
            try:
                return verify(parse(ground_truth), parse(answer))
            except:
                return False

    def ResolveResponse(self, response):
        if response is None or (isinstance(response, str) and response.strip() == ''):
            self.empty_responses_count += 1
            return False
        try:
            pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
            match = pattern.search(response)
            result_value = None
            if match:
                result_value = match.group(1).lower()
            else:
                # fallback parsing: look for a bare true/false in the response
                if "true" in response.lower():
                    result_value = "true"
                else:
                    result_value = "false"
            return result_value == "true"
        except Exception as e:
            self.logger.error(f"Response format error: {response}. Error: {e}")
            return False

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n"
                "1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n"
                "2. 语义匹配(semantic):使用LLM评估语义相似度,仅输入预测答案与标准答案\n\n"
                "输入参数:\n"
                "- input_test_answer_key:预测答案字段名\n"
                "- input_gt_answer_key:标准答案字段名\n"
                "- compare_method:比较方法(match/semantic)\n\n"
                "输出参数:\n"
                "- answer_match_result:匹配结果(True/False)\n"
                "- 统计结果将保存到指定的eval_result_path路径\n"
            )
        else:
            return (
                "This operator compares predicted answers against ground truth using two evaluation modes:\n\n"
                "1. String Matching (match): Mathematical verification for exact answers.\n"
                "2. Semantic Matching (semantic): LLM-based evaluation comparing predicted vs ground truth answers only.\n\n"
                "Input Parameters:\n"
                "- input_test_answer_key: Predicted answer field\n"
                "- input_gt_answer_key: Ground truth field\n"
                "- compare_method: Comparison method (match/semantic)\n\n"
                "Output Parameters:\n"
                "- answer_match_result: Boolean match result\n"
                "- Statistics are saved to the specified eval_result_path\n"
            )

    def check_column(self, required_columns: list[str], dataframe: pd.DataFrame):
        for column in required_columns:
            if column not in dataframe.columns:
                self.logger.error(f"Required column '{column}' not found in dataframe")
                return False
        return True

    def statistic(self, file_name_prefix: str, dataframe: pd.DataFrame, compare_method: str):
        total_samples = len(dataframe)
        valid_samples = len(dataframe) - self.empty_responses_count
        matched_samples = sum(dataframe['answer_match_result'])
        accuracy = matched_samples / valid_samples if valid_samples > 0 else 0

        stats = {
            "bench_name_or_prefix": file_name_prefix,
            "total_samples": total_samples,
            "valid_samples": valid_samples,
            "matched_samples": matched_samples,
            "accuracy": float(accuracy),
            "empty_responses_count": self.empty_responses_count,
            "compare_method": compare_method
        }
        stats_df = pd.DataFrame([stats])
        os.makedirs(os.path.dirname(self.eval_result_path), exist_ok=True)
        stats_df.to_json(self.eval_result_path, orient="records", force_ascii=False, indent=2)
        self.logger.success(f"Statistics saved to {self.eval_result_path}")
        return stats_df

    def run(self,
            storage: DataFlowStorage,
            input_test_answer_key: str = "generated_cot",
            input_gt_answer_key: str = "golden_answer") -> list:
        dataframe = storage.read("dataframe")
        dataframe['answer_match_result'] = False
        answers = dataframe[input_test_answer_key]
        ground_truths = dataframe[input_gt_answer_key]

        if self.compare_method == "match":
            if not self.check_column(required_columns=[input_test_answer_key, input_gt_answer_key], dataframe=dataframe):
                return [input_test_answer_key, input_gt_answer_key]
            for i in range(len(answers)):
                final_answer = self.answer_extractor.extract_answer(answers[i], None)
                dataframe.at[i, 'answer_match_result'] = self.compare(final_answer, ground_truths[i])
            storage.write(dataframe)
            self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
            return [input_test_answer_key, input_gt_answer_key, 'answer_match_result']
        else:
            if not self.check_column(required_columns=[input_test_answer_key, input_gt_answer_key], dataframe=dataframe):
                return [input_test_answer_key, input_gt_answer_key]
            empty_reference_mask = dataframe[input_gt_answer_key].isna() | (dataframe[input_gt_answer_key] == '')
            valid_rows = dataframe[~empty_reference_mask]
            skipped_count = len(dataframe[empty_reference_mask])
            if len(valid_rows) == 0:
                self.logger.warning("No valid reference answers found. All samples skipped.")
                storage.write(dataframe)
                return [input_test_answer_key, input_gt_answer_key, 'answer_match_result']

            # build prompts from the predicted answer and the reference answer only
            inputs = [
                self.prompt_template.build_prompt(
                    answer=row[input_test_answer_key],
                    reference_answer=row[input_gt_answer_key]
                )
                for _, row in valid_rows.iterrows()
            ]
            responses = self.llm_serving.generate_from_input(user_inputs=inputs, system_prompt=self.system_prompt)
            results = [self.ResolveResponse(response) for response in responses]
            for i, idx in enumerate(valid_rows.index):
                dataframe.at[idx, 'answer_match_result'] = results[i]
            storage.write(dataframe)
            self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
            self.empty_responses_count = 0
            return [input_test_answer_key, input_gt_answer_key, 'answer_match_result']
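For reference, the fallback logic in `ResolveResponse` can be checked in isolation; this standalone sketch restates the same regex and substring fallback on typical judge replies:

import re

pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
samples = [
    '{"judgement_result": true, "reason": "matches the reference"}',
    '{"judgement_result" : FALSE}',
    'The answer looks correct, so true.',  # no JSON field -> substring fallback
]
for s in samples:
    m = pattern.search(s)
    verdict = m.group(1).lower() if m else ("true" if "true" in s.lower() else "false")
    print(verdict == "true")  # True, False, True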
dataflow/operators/core_text/eval/bench_dataset_evaluator_question.py (new file, 0 → 100644)

from dataflow.utils.reasoning.AnswerExtraction import StringCleaner, UnitTextManager, AnswerExtractor
from dataflow.prompts.model_evaluation.general import AnswerJudgePromptQuestion, AnswerJudgeMultipleQuestionsPrompt
from dataflow.core.prompt import DIYPromptABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import LLMServingABC
from dataflow.core import OperatorABC
from math_verify import parse, verify
from dataflow import get_logger
from typing import Literal
import pandas as pd
import numpy as np
import time
import os  # needed for os.makedirs when writing results
import re
import json
import json5


@OPERATOR_REGISTRY.register()
class BenchDatasetEvaluatorQuestion(OperatorABC):
    def __init__(self,
                 eval_result_path: str = None,
                 compare_method: Literal["match", "semantic"] = "match",
                 system_prompt: str = "You are a helpful assistant specialized in evaluating answer correctness.",
                 llm_serving: LLMServingABC = None,
                 prompt_template: DIYPromptABC = None,
                 support_subquestions: bool = False,
                 keep_all_samples: bool = True):  # assumed default: run() references this flag, which was never set
        if eval_result_path is None:
            timestamp = int(time.time())
            eval_result_path = f"result_bencheval/BenchDatasetEvaluator_result_{timestamp}.json"
        self.eval_result_path = eval_result_path
        self.compare_method = compare_method
        self.empty_responses_count = 0  # counter for empty LLM responses
        # statistic() consults this flag in both modes, so set it unconditionally
        self.support_subquestions = support_subquestions
        self.keep_all_samples = keep_all_samples
        if compare_method == "match":
            self.compare = self.math_verify_compare
            unit_manager = UnitTextManager()
            string_cleaner = StringCleaner(unit_manager)
            self.answer_extractor = AnswerExtractor(string_cleaner)
        else:
            if prompt_template is None:
                prompt_template = AnswerJudgePromptQuestion() if not support_subquestions else AnswerJudgeMultipleQuestionsPrompt()
            self.prompt_template = prompt_template
            self.system_prompt = system_prompt
            self.llm_serving = llm_serving
        self.logger = get_logger()

    def math_verify_compare(self, answer, ground_truth):
        try:
            return verify(parse(str(ground_truth)), parse(str(answer)))
        except:
            try:
                return verify(parse(ground_truth), parse(answer))
            except:
                return False

    def ResolveResponse(self, response):
        # check for empty responses
        if not self.support_subquestions:
            if response is None or (isinstance(response, str) and response.strip() == ''):
                self.empty_responses_count += 1
                return False
            try:
                pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
                match = pattern.search(response)
                result_value = None
                if match:
                    result_value = match.group(1).lower()
                else:
                    # fallback parsing: look for a bare true/false in the response
                    if "true" in response.lower():
                        result_value = "true"
                    else:
                        result_value = "false"
                if result_value == "true":
                    return True
                else:
                    return False
            except Exception as e:
                self.logger.error(f"Response format error: {response}. Error: {e}")
                return False

        if self.support_subquestions:
            # with subquestions, the response is expected to carry a list of
            # judgements; return the ratio "correct/total"
            correct_num = 0
            total_num = 0
            try:
                response = json5.loads(response, strict=False)  # json5 tolerates loosely formatted JSON
                judgement = response.get("judgement", [])
            except Exception as e:
                self.logger.error(f"Response JSON parse error: {response}. Error: {e}")
                self.empty_responses_count += 1
                return "0/0"
            for resp in judgement:
                if isinstance(resp, bool):
                    if resp is True:
                        correct_num += 1
                        total_num += 1
                    elif resp is False:
                        total_num += 1
                elif isinstance(resp, str):
                    if resp.lower() == "true":
                        correct_num += 1
                        total_num += 1
                    elif resp.lower() == "false":
                        total_num += 1
                    elif resp.lower() == "empty":
                        continue  # not counted toward the total
            return f"{correct_num}/{total_num}"

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n"
                "1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n"
                "2. 语义匹配(semantic):使用LLM评估答案的语义相似度,适用于开放性问题\n\n"
                "输入参数:\n"
                "- input_test_answer_key:预测答案字段名\n"
                "- input_gt_answer_key:标准答案字段名\n"
                "- input_question_key:问题字段名(语义匹配模式下必需)\n"
                "- compare_method:比较方法(match/semantic)\n\n"
                "输出参数:\n"
                "- answer_match_result:匹配结果(True/False)\n"
                "- 统计结果将保存到指定的eval_result_path路径\n"
            )
        elif lang == "en":
            return (
                "This operator compares predicted answers against ground truth using two evaluation modes:\n\n"
                "1. String Matching (match): Uses mathematical verification to compare answers, suitable for questions with definitive answers\n"
                "2. Semantic Matching (semantic): Uses LLM to evaluate semantic similarity, suitable for open-ended questions\n\n"
                "Input Parameters:\n"
                "- input_test_answer_key: Predicted answer field\n"
                "- input_gt_answer_key: Ground truth field\n"
                "- input_question_key: Question field (required for semantic mode)\n"
                "- compare_method: Comparison method (match/semantic)\n\n"
                "Output Parameters:\n"
                "- answer_match_result: Matching result (True/False)\n"
                "- Statistics will be saved to the specified eval_result_path\n"
            )
        else:
            return "BenchEvaluator performs answer validation using string matching or semantic comparison"

    def check_column(self, required_columns: list[str], dataframe: pd.DataFrame):
        for column in required_columns:
            if column not in dataframe.columns:
                self.logger.error(f"Required column '{column}' not found in dataframe")
                return False
        return True

    def statistic(self, file_name_prefix: str, dataframe: pd.DataFrame, compare_method: Literal["match", "semantic"]):
        total_samples = len(dataframe)
        valid_samples = len(dataframe) - self.empty_responses_count
        matched_samples = sum(dataframe['answer_match_result'])
        accuracy = matched_samples / valid_samples if valid_samples > 0 else 0

        # assemble the statistics record
        stats = {
            "bench_name_or_prefix": file_name_prefix,
            "total_samples": total_samples,
            "valid_samples": valid_samples,
            "matched_samples": matched_samples,
            "accuracy": float(accuracy),  # ensure JSON-serializable
            "empty_responses_count": self.empty_responses_count,
            "compare_method": compare_method
        }
        if self.support_subquestions:
            total_subquestions = dataframe['total_subquestions'].sum()
            correct_subquestions = dataframe['correct_answer_num'].sum()
            subquestion_accuracy = correct_subquestions / total_subquestions if total_subquestions > 0 else 0
            stats.update({
                "total_subquestions": int(total_subquestions),
                "correct_subquestions": int(correct_subquestions),
                "subquestion_accuracy": float(subquestion_accuracy)
            })

        # convert the dict to a DataFrame and write it straight to self.eval_result_path
        stats_df = pd.DataFrame([stats])
        os.makedirs(os.path.dirname(self.eval_result_path), exist_ok=True)
        stats_df.to_json(self.eval_result_path, orient="records", force_ascii=False, indent=2)
        self.logger.success(f"Statistics saved to {self.eval_result_path}")
        return stats_df

    def run(self,
            storage: DataFlowStorage,
            input_test_answer_key: str = "generated_cot",
            input_gt_answer_key: str = "golden_answer",
            input_question_key: str = None,
            ) -> list:
        self.test_answer_key = input_test_answer_key
        self.gt_answer_key = input_gt_answer_key
        self.question_key = input_question_key
        dataframe = storage.read("dataframe")
        dataframe['answer_match_result'] = False
        answers = dataframe[self.test_answer_key]
        ground_truths = dataframe[self.gt_answer_key]

        if self.compare_method == "match":
            required_columns = [input_test_answer_key, input_gt_answer_key]
            if self.check_column(required_columns=required_columns, dataframe=dataframe) is False:
                return required_columns
            for i in range(len(answers)):
                final_answer = self.answer_extractor.extract_answer(answers[i], None)
                if self.compare(final_answer, ground_truths[i]):
                    dataframe.at[i, 'answer_match_result'] = True
                else:
                    dataframe.at[i, 'answer_match_result'] = False
            output_file = storage.write(dataframe)
            # compute statistics and write them straight to the JSON file
            stats = self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
            return [self.test_answer_key, self.gt_answer_key, 'answer_match_result']
        else:
            required_columns = [input_test_answer_key, input_gt_answer_key, input_question_key]
            if self.check_column(required_columns=required_columns, dataframe=dataframe) is False:
                return required_columns
            empty_reference_mask = dataframe[input_gt_answer_key].isna() | (dataframe[input_gt_answer_key] == '')
            skipped_rows = dataframe[empty_reference_mask]
            valid_rows = dataframe[~empty_reference_mask]
            skipped_count = len(skipped_rows)

            if len(valid_rows) == 0:
                self.logger.warning("No valid samples with reference answers found. All samples skipped.")
                if self.keep_all_samples:
                    output_file = storage.write(dataframe)  # keep every row; answer_match_result stays False
                else:
                    output_file = storage.write(pd.DataFrame(columns=dataframe.columns))  # keep no rows
                self.logger.info(f"Dataframe saved to {output_file}. Skipped {skipped_count} samples due to missing reference answers.")
                return required_columns + ['answer_match_result']

            # build prompts and call the LLM only for rows that have a reference answer
            inputs = [self.prompt_template.build_prompt(
                question=row[input_question_key],
                answer=row[input_test_answer_key],
                reference_answer=row[input_gt_answer_key]
            ) for _, row in valid_rows.iterrows()]
            responses = self.llm_serving.generate_from_input(user_inputs=inputs, system_prompt=self.system_prompt)
            # if self.support_subquestions:
            #     # each response is a list; flatten them into one long list,
            #     # e.g. [["true", "false"], ["true"]] -> ["true", "false", "true"]
            #     responses = [item for sublist in responses for item in sublist]
            results = [self.ResolveResponse(response) for response in responses]
            # result mask aligned with valid_rows (note: computed but unused; results are applied row by row below)
            result_mask = np.array(results, dtype=bool)

            # update answer_match_result for the valid rows
            valid_indices = valid_rows.index
            if not self.support_subquestions:
                for i, idx in enumerate(valid_indices):
                    dataframe.at[idx, 'answer_match_result'] = results[i]
            else:
                for i, idx in enumerate(valid_indices):
                    correct_answer_num = int(results[i].split('/')[0])
                    total_subquestions = int(results[i].split('/')[1])
                    dataframe.at[idx, 'correct_answer_num'] = correct_answer_num
                    dataframe.at[idx, 'total_subquestions'] = total_subquestions
                    # True only when every subquestion is answered correctly
                    dataframe.at[idx, 'answer_match_result'] = (correct_answer_num == total_subquestions) and (total_subquestions > 0)
                    dataframe.at[idx, 'response_evaluation'] = responses[i]  # keep the raw LLM response

            output_file = storage.write(dataframe)
            # compute statistics and write them straight to the JSON file
            stats = self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
            # reset the empty-response counter
            self.empty_responses_count = 0
            return [input_test_answer_key, input_gt_answer_key, input_question_key, 'answer_match_result']
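The "correct/total" rule in the subquestion branch above can be restated compactly; a standalone sketch of the same counting semantics (booleans and "true"/"false" strings count, "empty" entries are skipped):

judgement = [True, "false", "true", "empty", False]
correct = sum(1 for r in judgement
              if r is True or (isinstance(r, str) and r.lower() == "true"))
total = sum(1 for r in judgement
            if isinstance(r, bool) or (isinstance(r, str) and r.lower() in ("true", "false")))
print(f"{correct}/{total}")  # 2/4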
dataflow/operators/core_text/eval/prompted_eval.py (new file, 0 → 100644)

import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
import re


@OPERATOR_REGISTRY.register()
class PromptedEvaluator(OperatorABC):
    '''
    PromptedEvaluator scores each input row with an LLM and writes integer scores back.
    '''
    def __init__(self, llm_serving: LLMServingABC, system_prompt: str = "Please evaluate the quality of this data on a scale from 1 to 5."):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.system_prompt = system_prompt

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "PromptedEvaluator:使用 LLM 根据系统提示词对数据质量进行评分,并将评分写回 DataFrame(同时通过 "
                "storage 持久化)。模型应只输出分数(整数)。\n"
                "功能:对每行输入文本生成一个评分。\n"
                "输入参数:\n"
                "- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口。\n"
                "- system_prompt:系统提示词(默认:'Please evaluate the quality of this data on a scale from 1 to 5.')。\n"
                "- input_key:输入文本所在列名(默认:'raw_content')。\n"
                "- output_key:评分结果写入的列名(默认:'eval')。\n"
                "输出:\n"
                "- 返回输出列名(用于后续算子引用),评分结果已写回并保存。"
            )
        elif lang == "en":
            return (
                "PromptedEvaluator: uses an LLM to rate data quality and writes the score back to the "
                "DataFrame (persisted via storage). The model is expected to output only the integer score.\n"
                "Purpose: for each input row, produce a score.\n"
                "Input Parameters:\n"
                "- llm_serving: LLM serving object implementing LLMServingABC.\n"
                "- system_prompt: system prompt (default: 'Please evaluate the quality of this data on a scale from 1 to 5.').\n"
                "- input_key: column name containing input text (default: 'raw_content').\n"
                "- output_key: column name to store scores (default: 'eval').\n"
                "Output:\n"
                "- Returns the output column name for downstream operators; the scored DataFrame is saved."
            )
        else:
            return "PromptedEvaluator rates data quality (1–5) from input text and stores the integer score."

    def _parse_scores(self, outputs: list[str]) -> list[int]:
        """Convert the model's score strings to integers.

        - A score successfully extracted within the 1-5 range -> return it
        - Extraction failure or an out-of-range value -> return 0
        """
        results = []
        for out in outputs:
            score = 0
            try:
                if out is None:
                    results.append(0)
                    continue
                text = str(out).strip()
                # regex: take the first number in the reply
                match = re.search(r"\d+", text)
                if match:
                    val = int(match.group())
                    if 1 <= val <= 5:
                        score = val
                    # otherwise fall back to 0
            except Exception:
                score = 0
            results.append(score)
        return results

    def eval(self, dataframe, input_key):
        llm_inputs = []
        # build one prompt per row so the returned scores stay aligned with the dataframe
        for index, row in dataframe.iterrows():
            raw_content = row.get(input_key, '')
            llm_input = self.system_prompt + str(raw_content) + 'Please only output the score!'
            llm_inputs.append(llm_input)
        try:
            self.logger.info("Generating text using the model...")
            generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
            scores = self._parse_scores(generated_outputs)
            self.logger.info("Text generation completed.")
        except Exception as e:
            self.logger.error(f"Error during text generation: {e}")
            return
        return scores

    def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "eval"):
        self.logger.info("Running PromptedEvaluator...")

        # Load the raw dataframe from the input file
        dataframe = storage.read('dataframe')
        self.logger.info(f"Loading, number of rows: {len(dataframe)}")

        # Score every row
        generated_outputs = self.eval(dataframe, input_key)

        # Add the generated scores back to the dataframe
        dataframe[output_key] = generated_outputs

        # Save the updated dataframe to the output file
        output_file = storage.write(dataframe)
        return output_key
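The score-parsing rule above (first integer in the reply, accepted only in the 1-5 range) is easy to sanity-check standalone; this sketch restates `_parse_scores` for a single reply:

import re

def parse_score(text):
    """Restatement of PromptedEvaluator._parse_scores for one reply."""
    if text is None:
        return 0
    m = re.search(r"\d+", str(text).strip())
    val = int(m.group()) if m else 0
    return val if 1 <= val <= 5 else 0

assert [parse_score(t) for t in ["5", "Score: 3/5", "great!", None, "12"]] == [5, 3, 0, 0, 0]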
dataflow/operators/core_text/eval/text2qa_sample_evaluator.py (new file, 0 → 100644)

import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
import re
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict
from dataflow.prompts.text2qa import (
    Text2QAQuestionQualityPrompt,
    Text2QAAnswerAlignmentPrompt,
    Text2QAAnswerVerifiabilityPrompt,
    Text2QADownstreamValuePrompt
)


@prompt_restrict(
    Text2QAQuestionQualityPrompt,
    Text2QAAnswerAlignmentPrompt,
    Text2QAAnswerVerifiabilityPrompt,
    Text2QADownstreamValuePrompt
)
@OPERATOR_REGISTRY.register()
class Text2QASampleEvaluator(OperatorABC):
    '''
    Text2QASampleEvaluator grades generated QA pairs along four quality dimensions.
    '''
    def __init__(self,
                 llm_serving: LLMServingABC,
                 # prompt_template = None  # prompt is fixed
                 ):
        self.logger = get_logger()
        self.llm_serving = llm_serving

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子用于为给定文档片段生成的种子QA对打分\n\n"
                "输入参数:\n"
                "- input_question_key: Field name containing the generated question\n"
                "- input_answer_key: Field name containing the generated answer\n"
                "- output_question_quality_key: Field name containing the question quality grade\n"
                "- output_question_quality_feedback_key: Field name containing the question quality feedback\n"
                "- output_answer_alignment_key: Field name containing the answer alignment grade\n"
                "- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n"
                "- output_answer_verifiability_key: Field name containing the answer verifiability grade\n"
                "- output_downstream_value_key: Field name containing the downstream value grade\n"
                "- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n"
            )
        elif lang == "en":
            return (
                "This operator grades generated seed QA pairs for given document fragments.\n\n"
                "Input Parameters:\n"
                "- input_question_key: Field name containing the generated question\n"
                "- input_answer_key: Field name containing the generated answer\n"
                "- output_question_quality_key: Field name containing the question quality grade\n"
                "- output_question_quality_feedback_key: Field name containing the question quality feedback\n"
                "- output_answer_alignment_key: Field name containing the answer alignment grade\n"
                "- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n"
                "- output_answer_verifiability_key: Field name containing the answer verifiability grade\n"
                "- output_downstream_value_key: Field name containing the downstream value grade\n"
                "- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n"
            )
        else:
            return "QAScorer scores QA pairs for given document fragments."

    def _validate_dataframe(self, dataframe: pd.DataFrame):
        required_keys = [self.input_question_key, self.input_answer_key]
        forbidden_keys = [
            self.output_question_quality_key, self.output_question_quality_feedback_key,
            self.output_answer_alignment_key, self.output_answer_alignment_feedback_key,
            self.output_answer_verifiability_key, self.output_answer_verifiability_feedback_key,
            self.output_downstream_value_key, self.output_downstream_value_feedback_key
        ]
        missing = [k for k in required_keys if k not in dataframe.columns]
        conflict = [k for k in forbidden_keys if k in dataframe.columns]
        if missing:
            raise ValueError(f"Missing required column(s): {missing}")
        if conflict:
            raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")

    def _build_prompts(self, dataframe):
        """Build the four evaluation prompt lists (question quality, answer alignment,
        answer verifiability, downstream value), one entry per QA row."""
        question_quality_prompt = Text2QAQuestionQualityPrompt().build_prompt()
        answer_alignment_prompt = Text2QAAnswerAlignmentPrompt().build_prompt()
        answer_verifiability_prompt = Text2QAAnswerVerifiabilityPrompt().build_prompt()
        downstream_value_prompt = Text2QADownstreamValuePrompt().build_prompt()

        question_quality_inputs = []
        answer_alignment_inputs = []
        answer_verifiability_inputs = []
        downstream_value_inputs = []
        for index, row in dataframe.iterrows():
            qa_suffix = "Question: " + row[self.input_question_key] + "\n" + "Answer: " + row[self.input_answer_key]
            question_quality_inputs.append(question_quality_prompt + qa_suffix)
            answer_alignment_inputs.append(answer_alignment_prompt + qa_suffix)
            answer_verifiability_inputs.append(answer_verifiability_prompt + qa_suffix)
            downstream_value_inputs.append(downstream_value_prompt + qa_suffix)
        return question_quality_inputs, answer_alignment_inputs, answer_verifiability_inputs, downstream_value_inputs

    def _parse_grade_and_feedback(self, response: str) -> tuple:
        grading_match = re.search(r"\*\*Grading\*\*:\s*(\d+)", response)
        feedback_match = re.search(r"\*\*Feedback\*\*:\s*(.+)", response, re.DOTALL)
        grading = float(grading_match.group(1)) if grading_match else 0
        feedback = feedback_match.group(1).strip() if feedback_match else ''
        return grading, feedback

    def run(self,
            storage: DataFlowStorage,
            input_question_key: str = "generated_question",
            input_answer_key: str = "generated_answer",
            output_question_quality_key: str = "question_quality_grades",
            output_question_quality_feedback_key: str = "question_quality_feedbacks",
            output_answer_alignment_key: str = "answer_alignment_grades",
            output_answer_alignment_feedback_key: str = "answer_alignment_feedbacks",
            output_answer_verifiability_key: str = "answer_verifiability_grades",
            output_answer_verifiability_feedback_key: str = "answer_verifiability_feedbacks",
            output_downstream_value_key: str = "downstream_value_grades",
            output_downstream_value_feedback_key: str = "downstream_value_feedbacks"
            ):
        self.input_question_key = input_question_key
        self.input_answer_key = input_answer_key
        self.output_question_quality_key = output_question_quality_key
        self.output_question_quality_feedback_key = output_question_quality_feedback_key
        self.output_answer_alignment_key = output_answer_alignment_key
        self.output_answer_alignment_feedback_key = output_answer_alignment_feedback_key
        self.output_answer_verifiability_key = output_answer_verifiability_key
        self.output_answer_verifiability_feedback_key = output_answer_verifiability_feedback_key
        self.output_downstream_value_key = output_downstream_value_key
        self.output_downstream_value_feedback_key = output_downstream_value_feedback_key

        dataframe = storage.read("dataframe")
        self._validate_dataframe(dataframe)

        # build prompts
        q_inputs, a_inputs, v_inputs, d_inputs = self._build_prompts(dataframe)

        # generate the four kinds of grades and feedback
        self.logger.info("Scoring question quality...")
        q_scores = self.llm_serving.generate_from_input(user_inputs=q_inputs, system_prompt="")
        q_grades, q_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in q_scores])

        self.logger.info("Scoring answer alignment...")
        a_scores = self.llm_serving.generate_from_input(user_inputs=a_inputs, system_prompt="")
        a_grades, a_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in a_scores])

        self.logger.info("Scoring answer verifiability...")
        v_scores = self.llm_serving.generate_from_input(user_inputs=v_inputs, system_prompt="")
        v_grades, v_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in v_scores])

        self.logger.info("Scoring downstream value...")
        d_scores = self.llm_serving.generate_from_input(user_inputs=d_inputs, system_prompt="")
        d_grades, d_feedbacks = zip(*[self._parse_grade_and_feedback(r) for r in d_scores])

        # write results back
        dataframe[self.output_question_quality_key] = q_grades
        dataframe[self.output_question_quality_feedback_key] = q_feedbacks
        dataframe[self.output_answer_alignment_key] = a_grades
        dataframe[self.output_answer_alignment_feedback_key] = a_feedbacks
        dataframe[self.output_answer_verifiability_key] = v_grades
        dataframe[self.output_answer_verifiability_feedback_key] = v_feedbacks
        dataframe[self.output_downstream_value_key] = d_grades
        dataframe[self.output_downstream_value_feedback_key] = d_feedbacks

        output_file = storage.write(dataframe)
        self.logger.info(f"Results saved to {output_file}")

        return [
            output_question_quality_key, output_question_quality_feedback_key,
            output_answer_alignment_key, output_answer_alignment_feedback_key,
            output_answer_verifiability_key, output_answer_verifiability_feedback_key,
            output_downstream_value_key, output_downstream_value_feedback_key
        ]
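The two regexes in `_parse_grade_and_feedback` expect the judge reply in a fixed "**Grading**: / **Feedback**:" shape; a standalone check with a made-up reply:

import re

reply = "**Grading**: 4\n**Feedback**: The question is clear and answerable."
grade = float(re.search(r"\*\*Grading\*\*:\s*(\d+)", reply).group(1))
feedback = re.search(r"\*\*Feedback\*\*:\s*(.+)", reply, re.DOTALL).group(1).strip()
print(grade, feedback)  # 4.0 The question is clear and answerable.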
dataflow/operators/core_text/filter/general_filter.py (new file, 0 → 100644)

from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
import pandas as pd


@OPERATOR_REGISTRY.register()
class GeneralFilter(OperatorABC):
    def __init__(self, filter_rules: list):
        self.logger = get_logger()
        self.filter_rules = filter_rules
        self.logger.info(f"Initializing {self.__class__.__name__} with rules: {self.filter_rules}")

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子支持通过多个自定义函数对 DataFrame 进行灵活过滤。\n\n"
                "每条过滤规则是一个函数(例如 lambda 表达式),接受一个 DataFrame 并返回一个布尔类型的 Series,"
                "用于指定保留哪些行。\n\n"
                "输入参数:\n"
                "- filter_rules:一个函数列表,每个函数形式为 lambda df: ...,"
                "需返回一个与 df 长度一致的布尔 Series。所有规则之间采用与(AND)关系组合。\n\n"
                "示例:\n"
                " - lambda df: df['score'] > 0.5\n"
                " - lambda df: df['label'].isin(['A', 'B'])"
            )
        elif lang == "en":
            return (
                "This operator applies custom filtering functions to a DataFrame.\n\n"
                "Each filter rule is a function (e.g., lambda expression) that takes a DataFrame "
                "and returns a boolean Series indicating which rows to retain.\n\n"
                "Input Parameters:\n"
                "- filter_rules: A list of functions, each in the form of lambda df: ..., "
                "returning a boolean Series of the same length as the DataFrame. "
                "All rules are combined using logical AND.\n\n"
                "Examples:\n"
                " - lambda df: df['score'] > 0.5\n"
                " - lambda df: df['label'].isin(['A', 'B'])"
            )
        else:
            return "GeneralFilter filters DataFrame rows using a list of functions returning boolean Series."

    def _validate_dataframe(self, dataframe: pd.DataFrame):
        required_keys = [self.input_key]
        forbidden_keys = []
        missing = [k for k in required_keys if k not in dataframe.columns]
        conflict = [k for k in forbidden_keys if k in dataframe.columns]
        if missing:
            raise ValueError(f"Missing required column(s): {missing}")
        if conflict:
            raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")

    def run(self, storage: DataFlowStorage):
        df = storage.read("dataframe")
        mask = pd.Series(True, index=df.index)
        for rule_fn in self.filter_rules:
            if not callable(rule_fn):
                raise ValueError("Each filter rule must be a callable (e.g., lambda df: ...)")
            cond = rule_fn(df)
            if not isinstance(cond, pd.Series) or cond.dtype != bool:
                raise ValueError("Each filter function must return a boolean Series")
            mask &= cond

        filtered_df = df[mask]
        storage.write(filtered_df)
        self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_df)}.")
        return ""
dataflow/operators/core_text/filter/kcentergreedy_filter.py (new file, 0 → 100644)

import numpy as np
import pandas as pd
import random
import torch
from torch import Tensor
from typing import List, Optional
import torch.nn.functional as F
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC, LLMServingABC


class KCenterGreedy:
    """Implements k-center-greedy method.

    Args:
        embedding (Tensor): Embedding vector extracted from a LLM
        sampling_ratio (float): Ratio to choose coreset size from the embedding size.

    Example:
        >>> embedding.shape
        torch.Size([219520, 1536])
        >>> sampler = KCenterGreedy(embedding=embedding)
        >>> sampled_idxs = sampler.select_coreset_idxs()
        >>> coreset = embedding[sampled_idxs]
        >>> coreset.shape
        torch.Size([219, 1536])
    """

    def __init__(self, embedding: Tensor, sampling_ratio: float) -> None:
        self.embedding = embedding
        self.coreset_size = int(embedding.shape[0] * sampling_ratio)
        # self.model = SparseRandomProjection(eps=0.9)

        self.features: Tensor
        self.min_distances: Tensor = torch.tensor([])
        self.n_observations = self.embedding.shape[0]

    def reset_distances(self) -> None:
        """Reset minimum distances."""
        self.min_distances = torch.tensor([])

    def update_distances(self, cluster_centers: List[int]) -> None:
        """Update min distances given cluster centers.

        Args:
            cluster_centers (List[int]): indices of cluster centers
        """
        if cluster_centers:
            centers = self.features[cluster_centers]
            distance = F.pairwise_distance(self.features, centers, p=2).reshape(-1, 1)
            if self.min_distances.shape[0] == 0:
                self.min_distances = distance
            else:
                self.min_distances = torch.minimum(self.min_distances, distance)

    def get_new_idx(self) -> int:
        """Get index value of a sample.

        Based on minimum distance of the cluster

        Returns:
            int: Sample index
        """
        if isinstance(self.min_distances, Tensor):
            idx = int(torch.argmax(self.min_distances).item())
        else:
            raise ValueError(f"self.min_distances must be of type Tensor. Got {type(self.min_distances)}")
        return idx

    def select_coreset_idxs(self, selected_idxs: Optional[List[int]] = None) -> List[int]:
        """Greedily form a coreset to minimize the maximum distance of a cluster.

        Args:
            selected_idxs: index of samples already selected. Defaults to an empty set.

        Returns:
            indices of samples selected to minimize distance to cluster centers
        """
        if selected_idxs is None:
            selected_idxs = []

        if self.embedding.ndim == 2:
            # self.model.fit(self.embedding)
            # self.features = self.model.transform(self.embedding)
            self.features = self.embedding
            self.reset_distances()
        else:
            self.features = self.embedding.reshape(self.embedding.shape[0], -1)
            self.update_distances(cluster_centers=selected_idxs)

        selected_coreset_idxs: List[int] = []
        idx = int(torch.randint(high=self.n_observations, size=(1,)).item())
        cnt = 0
        for _ in range(self.coreset_size):
            cnt += 1
            if cnt % 1000 == 0:
                print(cnt)  # progress heartbeat for large corpora
            self.update_distances(cluster_centers=[idx])
            idx = self.get_new_idx()
            if idx in selected_idxs:
                raise ValueError("New indices should not be in selected indices.")
            self.min_distances[idx] = 0
            selected_coreset_idxs.append(idx)

        return selected_coreset_idxs

    def sample_coreset(self, selected_idxs: Optional[List[int]] = None) -> Tensor:
        """Select coreset from the embedding.

        Args:
            selected_idxs: index of samples already selected. Defaults to an empty set.

        Returns:
            Tensor: Output coreset

        Example:
            >>> embedding.shape
            torch.Size([219520, 1536])
            >>> sampler = KCenterGreedy(...)
            >>> coreset = sampler.sample_coreset()
            >>> coreset.shape
            torch.Size([219, 1536])
        """
        idxs = self.select_coreset_idxs(selected_idxs)
        coreset = self.embedding[idxs]
        return coreset


@OPERATOR_REGISTRY.register()
class KCenterGreedyFilter(OperatorABC):
    def __init__(self, num_samples: int, embedding_serving: LLMServingABC = None):
        self.num_samples = num_samples
        self.embedding_serving = embedding_serving
        self.logger = get_logger()

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子用于从大量的文档片段中选取部分文档片段,用于后续生成种子QA对\n\n"
                "输入参数:\n"
                "- input_key: 包含文档片段的字段名\n"
                "- embedding_serving: 嵌入服务对象\n"
                "- num_samples: 选取的文档片段数量\n\n"
            )
        elif lang == "en":
            return (
                "This operator chooses document fragments for seed QA pairs via k-center-greedy sampling.\n\n"
                "Input Parameters:\n"
                "- input_key: Field name containing the content\n"
                "- embedding_serving: Embedding serving\n"
                "- num_samples: Number of document fragments to select\n\n"
                "Output Parameters:\n"
                "- Returns the input_key; the dataframe is reduced to the selected fragments"
            )
        else:
            return "ContentChooser chooses document fragments for seed QA pairs"

    def _validate_dataframe(self, dataframe: pd.DataFrame):
        required_keys = [self.input_key]
        forbidden_keys = []

        missing = [k for k in required_keys if k not in dataframe.columns]
        conflict = [k for k in forbidden_keys if k in dataframe.columns]

        if missing:
            self.logger.error(f"Missing required column(s): {missing}")
        if conflict:
            self.logger.error(f"The following column(s) already exist and would be overwritten: {conflict}")

    def run(self,
            storage: DataFlowStorage,
            input_key: str = "content",
            ) -> list:
        '''
        Select a coreset of document fragments with k-center-greedy sampling.
        '''
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        self._validate_dataframe(dataframe)
        texts = dataframe[self.input_key].tolist()
        indexes = np.zeros(len(dataframe)).astype(int)
        embeddings_list = self.embedding_serving.generate_embedding_from_input(texts)
        embeddings = torch.tensor(embeddings_list)
        sampler = KCenterGreedy(embedding=embeddings, sampling_ratio=self.num_samples / len(texts))
        chosen_indexes = sampler.select_coreset_idxs()
        for index in chosen_indexes:
            indexes[index] = 1
        dataframe = dataframe[np.array(indexes) == 1]
        output_file = storage.write(dataframe)
        self.logger.info(f"Results saved to {output_file}")
        return [self.input_key, ]
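The sampler above runs standalone on any 2-D tensor; a quick check with random embeddings in place of a real embedding service:

import torch

emb = torch.randn(1000, 64)                      # 1000 fragments, 64-dim embeddings
sampler = KCenterGreedy(embedding=emb, sampling_ratio=0.05)
idxs = sampler.select_coreset_idxs()             # greedy farthest-point selection
print(len(idxs), emb[idxs].shape)                # 50 torch.Size([50, 64])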
dataflow/operators/core_text/filter/prompted_filter.py (new file, 0 → 100644)

import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.operators.core_text import PromptedEvaluator


@OPERATOR_REGISTRY.register()
class PromptedFilter(OperatorABC):
    '''
    PromptedFilter scores rows with PromptedEvaluator and keeps those whose score falls in a configurable range.
    '''
    def __init__(self,
                 llm_serving: LLMServingABC,
                 system_prompt: str = "Please evaluate the quality of this data on a scale from 1 to 5.",
                 min_score=1,
                 max_score=5):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.prompted_evaluator = PromptedEvaluator(llm_serving, system_prompt)
        self.min_score = min_score
        self.max_score = max_score

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "PromptedFilter 使用内置的 PromptedEvaluator 对输入数据进行数值化打分,"
                "并根据指定的分数区间(min_score 到 max_score,闭区间)筛选出符合条件的样本。"
                "默认情况下打分范围是 1–5,但用户可以通过 system_prompt 自定义其他评分规则。\n"
                "\n输入参数:\n"
                "- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n"
                "- system_prompt:系统提示词,定义评估规范(可选,默认 "
                "'Please evaluate the quality of this data on a scale from 1 to 5.')\n"
                "- input_key:待评估文本所在列名(默认 'raw_content')\n"
                "- output_key:写回打分结果的列名(默认 'eval',若已存在将被覆盖)\n"
                "- min_score:筛选的最小分(默认 1)\n"
                "- max_score:筛选的最大分(默认 5)\n"
                "\n输出参数:\n"
                "- 过滤后的 DataFrame(仅保留分数位于 [min_score, max_score] 的行)\n"
                "- 返回 output_key 以供后续算子引用\n"
                "\n备注:\n"
                "- 默认打分区间是 1–5,但可根据实际 prompt 改变。"
            )
        elif lang == "en":
            return (
                "PromptedFilter leverages PromptedEvaluator to assign numeric scores to input data, "
                "and filters rows whose scores fall within [min_score, max_score] (inclusive). "
                "By default, the scoring scale is 1–5, but this can be customized through system_prompt.\n"
                "\nInput Parameters:\n"
                "- llm_serving: LLM serving object implementing LLMServingABC\n"
                "- system_prompt: System prompt defining the evaluation criteria "
                "(default: 'Please evaluate the quality of this data on a scale from 1 to 5.')\n"
                "- input_key: Column name containing the text to evaluate (default 'raw_content')\n"
                "- output_key: Column name to store the score (default 'eval'; overwritten if it exists)\n"
                "- min_score: Minimum score for filtering (default 1)\n"
                "- max_score: Maximum score for filtering (default 5)\n"
                "\nOutput:\n"
                "- Filtered DataFrame (rows with scores in [min_score, max_score])\n"
                "- Returns output_key for downstream operators\n"
                "\nNote:\n"
                "- Default scoring range is 1–5, but can vary depending on the system_prompt."
            )
        else:
            return "PromptedFilter scores rows via PromptedEvaluator and filters by a configurable score range (default 1–5)."

    def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "eval"):
        self.logger.info("Running PromptedFilter...")

        # Load the raw dataframe from the input file
        dataframe = storage.read('dataframe')
        self.logger.info(f"Loading, number of rows: {len(dataframe)}")

        # Score every row with the built-in evaluator
        generated_outputs = self.prompted_evaluator.eval(dataframe, input_key)

        # Add the scores back to the dataframe, then keep rows inside the score range
        dataframe[output_key] = generated_outputs
        filtered_dataframe = dataframe[(dataframe[output_key] >= self.min_score) & (dataframe[output_key] <= self.max_score)]

        # Save the filtered dataframe to the output file
        output_file = storage.write(filtered_dataframe)
        return output_key
dataflow/operators/core_text/generate/embedding_generator.py (new file, 0 → 100644)

from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC


@OPERATOR_REGISTRY.register()
class EmbeddingGenerator(OperatorABC):
    '''
    Embedding Generator is a class that generates embeddings for given input text.
    '''
    def __init__(self, embedding_serving: LLMServingABC):
        self.logger = get_logger()
        self.embedding_serving = embedding_serving
        self.logger.info(f"Initializing {self.__class__.__name__}...")
        self.logger.info(f"{self.__class__.__name__} initialized.")

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "EmbeddingGenerator算子用于从输入文本生成向量表示(embedding),"
                "通常用于语义检索、聚类或下游模型输入等任务。\n\n"
                "输入参数:\n"
                "- embedding_serving:Embedding服务对象,需实现LLMServingABC接口,用于生成文本的向量表示\n"
                "- input_key:输入文本字段名,默认为'text'\n"
                "- output_key:输出向量字段名,默认为'embeddings'\n\n"
                "输出参数:\n"
                "- 包含文本向量的DataFrame,每行对应一个输入文本的embedding\n"
                "- 返回输出字段名(如'embeddings'),可供后续算子引用"
            )
        elif lang == "en":
            return (
                "The EmbeddingGenerator operator generates vector representations (embeddings) "
                "from input text, typically used for semantic retrieval, clustering, or downstream model inputs.\n\n"
                "Input Parameters:\n"
                "- embedding_serving: Embedding service object implementing the LLMServingABC interface for generating text embeddings\n"
                "- input_key: Field name for input text, default is 'text'\n"
                "- output_key: Field name for output embeddings, default is 'embeddings'\n\n"
                "Output Parameters:\n"
                "- DataFrame containing text embeddings, where each row corresponds to one input text\n"
                "- Returns the output field name (e.g., 'embeddings') for subsequent operator reference"
            )
        else:
            return (
                "EmbeddingGenerator generates vector embeddings from text input for retrieval or representation learning tasks."
            )

    def run(self,
            storage: DataFlowStorage,
            input_key: str = "text",
            output_key: str = "embeddings",
            ):
        dataframe = storage.read("dataframe")
        self.input_key = input_key
        self.output_key = output_key
        texts = dataframe[self.input_key].tolist()
        embeddings_list = self.embedding_serving.generate_embedding_from_input(texts)
        # embeddings = torch.tensor(embeddings_list)
        dataframe[self.output_key] = embeddings_list
        output_file = storage.write(dataframe)
        self.logger.info(f"Results saved to {output_file}")
        return [self.output_key]
dataflow/operators/core_text/generate/prompt_templated_generator.py
0 → 100644
View file @
97e8278b
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
import string
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict, PromptABC, DIYPromptABC
from typing import Union, Any, Set
from dataflow.prompts.core_text import StrFormatPrompt


@prompt_restrict(StrFormatPrompt)
@OPERATOR_REGISTRY.register()
class PromptTemplatedGenerator(OperatorABC):
    def __init__(
        self,
        llm_serving: LLMServingABC,
        prompt_template: Union[StrFormatPrompt, DIYPromptABC] = None,
    ):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.prompt_template = prompt_template
        if prompt_template is None:
            raise ValueError("prompt_template cannot be None")

    def run(
        self,
        storage: DataFlowStorage,
        output_key: str = "generated_content",
        **input_keys: Any
    ):
        self.storage: DataFlowStorage = storage
        self.output_key = output_key
        self.logger.info("Running PromptTemplatedGenerator...")
        self.input_keys = input_keys
        need_fields = set(input_keys.keys())
        # Load the raw dataframe from the input file
        dataframe = storage.read('dataframe')
        self.logger.info(f"Loading, number of rows: {len(dataframe)}")
        llm_inputs = []
        for idx, row in dataframe.iterrows():
            key_dict = {}
            for key in need_fields:
                key_dict[key] = row[input_keys[key]]
            prompt_text = self.prompt_template.build_prompt(need_fields, **key_dict)
            llm_inputs.append(prompt_text)
        self.logger.info(f"Prepared {len(llm_inputs)} prompts for LLM generation.")
        # Generate content using the LLM serving
        generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
        dataframe[self.output_key] = generated_outputs
        output_file = self.storage.write(dataframe)
        return output_key

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "基于模板化提示词(Prompt Template)生成内容的算子。"
                "该算子使用用户定义的提示模板(StrFormatPrompt 或 DIYPrompt),"
                "结合输入数据中的字段自动构造完整提示词并调用大语言模型生成结果。\n\n"
                "输入参数:\n"
                "- llm_serving:LLM服务对象,需实现LLMServingABC接口,用于执行文本生成任务\n"
                "- prompt_template:提示词模板对象(StrFormatPrompt 或 DIYPromptABC),用于定义提示结构\n"
                "- input_keys:输入字段映射字典,用于将DataFrame中的列名映射到模板字段\n"
                "- output_key:输出生成内容字段名,默认为'generated_content'\n\n"
                "输出参数:\n"
                "- 包含生成结果的新DataFrame\n"
                "- 返回输出字段名,以便后续算子引用\n\n"
                "使用场景:\n"
                "适用于需要通过模板化提示构建多样输入、批量生成文本内容的场景,例如标题生成、摘要生成、问答模板填充等。"
            )
        elif lang == "en":
            return (
                "An operator for content generation based on templated prompts. "
                "This operator uses a user-defined prompt template (StrFormatPrompt or DIYPromptABC) "
                "to automatically construct full prompts from input data fields and generate outputs via an LLM.\n\n"
                "Input Parameters:\n"
                "- llm_serving: LLM serving object implementing LLMServingABC interface, responsible for text generation\n"
                "- prompt_template: Prompt template object (StrFormatPrompt or DIYPromptABC) defining the prompt structure\n"
                "- input_keys: Dictionary mapping DataFrame column names to template fields\n"
                "- output_key: Field name for generated content, default is 'generated_content'\n\n"
                "Output Parameters:\n"
                "- DataFrame containing generated outputs\n"
                "- Returns the output field name for downstream operator reference\n\n"
                "Use Case:\n"
                "Ideal for tasks requiring templated prompt-driven generation, such as title generation, text summarization, or Q&A filling."
            )
        else:
            return (
                "PromptTemplatedGenerator generates text based on a user-defined prompt template."
            )
\ No newline at end of file
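
A sketch of how the **input_keys mapping is meant to be used, based on the run() signature above. The StrFormatPrompt constructor and str.format-style placeholders are assumptions not confirmed by this commit, and `my_llm_serving` / `storage` are hypothetical:

# Usage sketch (assumptions: StrFormatPrompt wraps a str.format-style template;
# my_llm_serving is any LLMServingABC; storage holds 'headline'/'body' columns).
from dataflow.operators.core_text import PromptTemplatedGenerator
from dataflow.prompts.core_text import StrFormatPrompt

template = StrFormatPrompt("Summarize '{title}' given the text:\n{content}")
op = PromptTemplatedGenerator(llm_serving=my_llm_serving, prompt_template=template)

# Keyword arguments map template fields to dataframe columns:
# template field 'title' <- column 'headline', 'content' <- column 'body'.
op.run(storage, output_key="summary", title="headline", content="body")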
dataflow/operators/core_text/generate/prompted_generator.py
0 → 100644
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC


@OPERATOR_REGISTRY.register()
class PromptedGenerator(OperatorABC):
    '''
    PromptedGenerator generates text for each row by prepending a system
    prompt to the row's input content and sending the result to an LLM.
    '''
    def __init__(
        self,
        llm_serving: LLMServingABC,
        system_prompt: str = "You are a helpful agent.",
        json_schema: dict = None,
    ):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.json_schema = json_schema
        self.system_prompt = system_prompt

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "基于用户提供的提示词(prompt)生成数据。结合系统提示词和输入内容生成符合要求的输出文本。"
                "输入参数:\n"
                "- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
                "- system_prompt:系统提示词,定义模型行为,默认为'You are a helpful agent.'\n"
                "- input_key:输入内容字段名,默认为'raw_content'\n"
                "- output_key:输出生成内容字段名,默认为'generated_content'\n"
                "输出参数:\n"
                "- 包含生成内容的DataFrame\n"
                "- 返回输出字段名,用于后续算子引用"
            )
        elif lang == "en":
            return (
                "Generate data from user-provided prompts. Combines system prompt and input content to generate desired output text.\n"
                "Input Parameters:\n"
                "- llm_serving: LLM serving object implementing LLMServingABC interface\n"
                "- system_prompt: System prompt to define model behavior, default is 'You are a helpful agent.'\n"
                "- input_key: Field name for input content, default is 'raw_content'\n"
                "- output_key: Field name for output generated content, default is 'generated_content'\n\n"
                "Output Parameters:\n"
                "- DataFrame containing generated content\n"
                "- Returns output field name for subsequent operator reference"
            )
        else:
            return (
                "PromptedGenerator generates text based on system prompt and input content."
            )

    def run(
        self,
        storage: DataFlowStorage,
        input_key: str = "raw_content",
        output_key: str = "generated_content"
    ):
        self.input_key, self.output_key = input_key, output_key
        self.logger.info("Running PromptedGenerator...")
        # Load the raw dataframe from the input file
        dataframe = storage.read('dataframe')
        self.logger.info(f"Loading, number of rows: {len(dataframe)}")
        # Prepare LLM inputs by prepending the system prompt to each row's
        # content. An input is built for every row (empty content included)
        # so the generated outputs always align with the dataframe length.
        llm_inputs = []
        for index, row in dataframe.iterrows():
            raw_content = row.get(self.input_key, '')
            llm_inputs.append(self.system_prompt + str(raw_content))
        # Generate the text using the model
        try:
            self.logger.info("Generating text using the model...")
            if self.json_schema is not None:
                generated_outputs = self.llm_serving.generate_from_input(llm_inputs, json_schema=self.json_schema)
            else:
                generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
            self.logger.info("Text generation completed.")
        except Exception as e:
            self.logger.error(f"Error during text generation: {e}")
            return
        # Add the generated content back to the dataframe
        dataframe[self.output_key] = generated_outputs
        # Save the updated dataframe to the output file
        output_file = storage.write(dataframe)
        return output_key
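
A usage sketch for the operator above. The json_schema pass-through mirrors the branch in run(); `my_llm_serving` and `my_storage` are hypothetical LLMServingABC / DataFlowStorage instances:

# Usage sketch (my_llm_serving / my_storage are hypothetical implementations).
from dataflow.operators.core_text import PromptedGenerator

schema = {"type": "object", "properties": {"summary": {"type": "string"}}}
op = PromptedGenerator(
    llm_serving=my_llm_serving,
    system_prompt="Summarize the following text as JSON: ",
    json_schema=schema,   # forwarded to generate_from_input when not None
)
op.run(my_storage, input_key="raw_content", output_key="generated_content")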
dataflow/operators/core_text/generate/random_domain_knowledge_row_generator.py
0 → 100644
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict, DIYPromptABC
from dataflow.prompts.general_text import SFTFromScratchGeneratorPrompt
from typing import Union


@prompt_restrict(SFTFromScratchGeneratorPrompt)
@OPERATOR_REGISTRY.register()
class RandomDomainKnowledgeRowGenerator(OperatorABC):
    def __init__(
        self,
        llm_serving: LLMServingABC,
        generation_num: int,
        domain_keys: str,
        prompt_template: Union[SFTFromScratchGeneratorPrompt, DIYPromptABC] = None,
    ):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.prompt_template = prompt_template
        self.generation_num = generation_num
        self.domain_keys = domain_keys
        if prompt_template is None:
            raise ValueError("prompt_template cannot be None")

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "RandomDomainKnowledgeRowGenerator算子用于结合提示模板(prompt_template)与LLM服务对象(llm_serving),"
                "批量生成与指定领域相关的文本内容。\n\n"
                "功能说明:\n"
                "- 结合SFTFromScratchGeneratorPrompt模板,根据domain_keys随机选择领域并生成内容;\n"
                "- 当输入DataFrame为空时,可通过generation_num参数控制生成样本数量;\n"
                "- 生成的文本结果将写入指定字段(output_key),并返回该字段名供后续算子使用。\n\n"
                "参数说明:\n"
                "- llm_serving:LLM服务对象,需实现LLMServingABC接口;\n"
                "- prompt_template:提示模板实例,需为SFTFromScratchGeneratorPrompt类型;\n"
                "- storage:DataFlowStorage对象,用于读取与写入数据;\n"
                "- output_key:生成结果写入的字段名,默认为'generated_content';\n"
                "- generation_num:生成内容数量,默认为1;\n"
                "- domain_keys:指定或限制生成内容所属领域。\n\n"
                "输出说明:\n"
                "- 返回值:输出字段名(output_key),供后续算子引用;\n"
                "- 同时将包含生成内容的新DataFrame写回至存储。"
            )
        elif lang == "en":
            return (
                "The RandomDomainKnowledgeRowGenerator operator generates domain-related text content "
                "by combining a prompt template (prompt_template) with an LLM serving instance (llm_serving).\n\n"
                "Function Description:\n"
                "- Utilizes the SFTFromScratchGeneratorPrompt template to randomly select domains via domain_keys;\n"
                "- Supports content generation when no input DataFrame is available, controlled by generation_num;\n"
                "- Generated text is written to the specified output field (output_key), and the field name is returned.\n\n"
                "Parameter Description:\n"
                "- llm_serving: LLM serving object implementing the LLMServingABC interface;\n"
                "- prompt_template: Prompt template instance of type SFTFromScratchGeneratorPrompt;\n"
                "- storage: DataFlowStorage object used for reading and writing data;\n"
                "- output_key: Name of the field to write generated results (default: 'generated_content');\n"
                "- generation_num: Number of contents to generate when there is no input data (default: 1);\n"
                "- domain_keys: Domain key(s) specifying or constraining the generation domain; empty string for random.\n\n"
                "Output Description:\n"
                "- Returns the output field name (output_key) for downstream reference;\n"
                "- Writes the DataFrame containing generated content back to storage."
            )
        else:
            return (
                "RandomDomainKnowledgeRowGenerator combines a prompt template (prompt_template) with an LLM "
                "serving object (llm_serving) to batch-generate domain-specific text content."
            )

    def run(
        self,
        storage: DataFlowStorage,
        output_key: str = "generated_content"
    ):
        """
        Main flow: generate text content from the prompt template.

        Args:
        - storage: DataFlowStorage object used for reading and writing data;
        - output_key: output field name, default 'generated_content'.

        Returns:
        - The output field name (output_key) for downstream operator reference.
        """
        self.output_key = output_key
        self.logger.info("Running RandomDomainKnowledgeRowGenerator...")
        # Read the DataFrame from storage
        dataframe = storage.read('dataframe')
        self.logger.info(f"Loaded data, number of rows: {len(dataframe)}")
        # Build generation_num prompts from the template
        llm_inputs = []
        for i in range(self.generation_num):
            llm_inputs.append(self.prompt_template.build_prompt(self.domain_keys))
        try:
            self.logger.info("Generating text using the model...")
            # Call the LLM serving to generate text
            generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
            self.logger.info("Text generation completed.")
        except Exception as e:
            self.logger.error(f"Error during text generation: {e}")
            return
        # Write the generated content into a new DataFrame column.
        # Note: the input dataframe is expected to be empty (or to have exactly
        # generation_num rows) so that this assignment aligns.
        dataframe[self.output_key] = generated_outputs
        # Write the result back to storage
        output_file = storage.write(dataframe)
        return output_key
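
A usage sketch for the operator above. The no-argument SFTFromScratchGeneratorPrompt constructor is an assumption, and `my_llm_serving` / `empty_storage` are hypothetical:

# Usage sketch (assumption: SFTFromScratchGeneratorPrompt needs no required
# constructor arguments; my_llm_serving / empty_storage are hypothetical).
from dataflow.operators.core_text import RandomDomainKnowledgeRowGenerator
from dataflow.prompts.general_text import SFTFromScratchGeneratorPrompt

op = RandomDomainKnowledgeRowGenerator(
    llm_serving=my_llm_serving,
    generation_num=10,          # produce 10 rows of synthetic domain text
    domain_keys="biology",      # constrain the domain; "" would mean random
    prompt_template=SFTFromScratchGeneratorPrompt(),
)
op.run(empty_storage, output_key="generated_content")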
dataflow/operators/core_text/generate/retrieval_generator.py
0 → 100644
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.serving.light_rag_serving import LightRAGServing


@OPERATOR_REGISTRY.register()
class RetrievalGenerator(OperatorABC):
    def __init__(
        self,
        llm_serving: LightRAGServing,
        system_prompt: str = "You are a helpful agent.",
        json_schema: dict = None,
    ):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.json_schema = json_schema
        self.system_prompt = system_prompt

    async def run(
        self,
        storage: DataFlowStorage,
        input_key: str = "raw_content",
        output_key: str = "generated_content",
    ):
        self.input_key, self.output_key = input_key, output_key
        self.logger.info("Running RetrievalGenerator...")
        # Load the raw dataframe from the input file
        df = storage.read('dataframe')
        self.logger.info(f"Loading, number of tasks: {len(df)}")
        # Build one input per row (empty content included) so the generated
        # outputs stay aligned with the dataframe length.
        llm_inputs = []
        for index, row in df.iterrows():
            raw_content = row.get(self.input_key, '')
            llm_inputs.append(str(raw_content))
        try:
            self.logger.info("Generating text using the model...")
            generated_outputs = await self.llm_serving.generate_from_input(llm_inputs, self.system_prompt)
            self.logger.info("Text generation completed.")
        except Exception as e:
            self.logger.error(f"Error during text generation: {e}")
            return
        df[self.output_key] = generated_outputs
        output_file = storage.write(df)
        return output_key
\ No newline at end of file
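
Since run() is declared async here, it must be awaited. A sketch, with `my_rag_serving` and `my_storage` as hypothetical LightRAGServing / DataFlowStorage instances:

# Usage sketch: run() is a coroutine, so drive it with asyncio.
import asyncio
from dataflow.operators.core_text import RetrievalGenerator

op = RetrievalGenerator(
    llm_serving=my_rag_serving,   # hypothetical LightRAGServing instance
    system_prompt="Answer using the retrieved context.",
)
asyncio.run(op.run(my_storage, input_key="raw_content", output_key="generated_content"))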
dataflow/operators/core_text/generate/text2multihopqa_generator.py
0 → 100644
from dataflow.prompts.text2qa import Text2MultiHopQAGeneratorPrompt
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
import random
from typing import Any, Dict, List, Optional, Sequence, Union
import json
from tqdm import tqdm
from dataflow.core.prompt import prompt_restrict, DIYPromptABC


@prompt_restrict(Text2MultiHopQAGeneratorPrompt)
@OPERATOR_REGISTRY.register()
class Text2MultiHopQAGenerator(OperatorABC):
    r"""A processor for generating multi-hop question-answer pairs from user
    data.

    This class handles the processing of text data to generate multi-hop
    question-answer pairs using either an AI model or rule-based approaches.
    It manages the entire pipeline from text preprocessing to dataset curation.
    """

    def __init__(
        self,
        llm_serving: LLMServingABC,
        seed: int = 0,
        lang="en",
        prompt_template: Union[Text2MultiHopQAGeneratorPrompt, DIYPromptABC] = None,
        num_q=5
    ):
        r"""Initialize the Text2MultiHopQAGenerator.

        Args:
            llm_serving (LLMServingABC): LLM serving backend.
            seed (int, optional): Random seed. (default: :obj:`0`)
            lang (str, optional): Language of the input text.
                (default: :obj:`"en"`)
            prompt_template (optional): Prompt template instance; falls back
                to Text2MultiHopQAGeneratorPrompt. (default: :obj:`None`)
            num_q (int, optional): Maximum number of QA pairs kept per row.
                (default: :obj:`5`)
        """
        self.rng = random.Random(seed)
        self.llm_serving = llm_serving
        self.lang = lang
        self.logger = get_logger()
        self.num_q = num_q
        if prompt_template:
            self.prompt_template = prompt_template
        else:
            self.prompt_template = Text2MultiHopQAGeneratorPrompt(lang=self.lang)

    @staticmethod
    def get_desc(lang: str = "zh") -> tuple:
        """Returns a description of the processor's functionality.

        Args:
            lang (str, optional): Language for description ('zh' or 'en').

        Returns:
            tuple: Description strings in specified language, including format example
        """
        if lang == "zh":
            return (
                "Text2MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。",
                "处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。",
                "输出格式如下:",
                "输入:\n"
                "text: <原始上下文文本>",
                "输出:\n"
                "{\n"
                "  \"text\": <处理后的文本字符串>,\n"
                "  \"qa_pairs\": [\n"
                "    {\n"
                "      \"question\": <字符串:生成的问题>,\n"
                "      \"reasoning_steps\": [\n"
                "        {\"step\": <推理过程的步骤 1>},\n"
                "        {\"step\": <步骤 2>} ...\n"
                "      ],\n"
                "      \"answer\": <字符串:最终答案>,\n"
                "      \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\n"
                "      \"type\": <可选:问题类型,如“生物学”、“历史”等>\n"
                "    },\n"
                "    ...\n"
                "  ],\n"
                "  \"metadata\": {\n"
                "    \"source\": <数据来源>,\n"
                "    \"timestamp\": <时间戳字符串>,\n"
                "    \"complexity\": <整数:问题复杂度标记>\n"
                "  }\n"
                "}"
            )
        else:
            return (
                "Text2MultiHopQAGenerator is a processor for generating multi-hop question-answer pairs from raw text.",
                "It includes preprocessing, information extraction, and reasoning-based QA generation, with configurable LLM backends.",
                "Expected output format:",
                "Input:\n"
                "text: <raw input context>",
                "Output:\n"
                "{\n"
                "  \"text\": <processed input text>,\n"
                "  \"qa_pairs\": [\n"
                "    {\n"
                "      \"question\": <string: generated question>,\n"
                "      \"reasoning_steps\": [\n"
                "        {\"step\": <inference step 1>},\n"
                "        {\"step\": <inference step 2>} ...\n"
                "      ],\n"
                "      \"answer\": <string: final answer>,\n"
                "      \"supporting_facts\": [<fact 1>, <fact 2>, ...],\n"
                "      \"type\": <optional string: QA category>\n"
                "    },\n"
                "    ...\n"
                "  ],\n"
                "  \"metadata\": {\n"
                "    \"source\": <source string>,\n"
                "    \"timestamp\": <timestamp string>,\n"
                "    \"complexity\": <integer: reasoning complexity>\n"
                "  }\n"
                "}"
            )

    def process_text(
        self,
        text: str,
        source: str = "user_input"
    ) -> List[Dict[str, Any]]:
        r"""Process a single text to generate multi-hop QA pairs.

        Args:
            text (str): The input text to process.
            source (str, optional): Source identifier for the text.
                (default: :obj:`"user_input"`)

        Returns:
            List[Dict[str, Any]]: List of processed examples with QA pairs and
                metadata.
        """
        # Convert text to standard format
        raw_data = [
            {
                'text': text,
                'source': source,
            }
        ]
        # Construct examples (pass the configured prompt template, mirroring process_batch)
        constructor = ExampleConstructor(lang=self.lang, llm_serving=self.llm_serving,
                                         prompt_template=self.prompt_template)
        examples = constructor.construct_examples(raw_data)
        # Manage data
        # curator = DataCurator(self.config, self.rng)
        # final_dataset = curator.curate_dataset(examples)
        return examples

    def process_batch(
        self,
        texts: List[str],
        sources: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        r"""Process multiple texts in batch to generate multi-hop QA pairs.

        Args:
            texts (List[str]): List of input texts to process.
            sources (Optional[List[str]], optional): List of source
                identifiers. (default: :obj:`None`)

        Returns:
            List[Dict[str, Any]]: List of processed examples with QA pairs and
                metadata.

        Raises:
            ValueError: If length of sources doesn't match length of texts.
        """
        if sources is None:
            sources = ["default_source"] * len(texts)
        elif len(sources) != len(texts):
            raise ValueError("Length of sources must match length of texts")
        raw_data = [
            {
                'text': text,
                'source': source,
            }
            for text, source in zip(texts, sources)
        ]
        # Construct examples
        constructor = ExampleConstructor(lang=self.lang, llm_serving=self.llm_serving,
                                         prompt_template=self.prompt_template)
        examples = constructor.construct_examples(raw_data)
        # Manage data
        # curator = DataCurator(self.config, self.rng)
        # final_dataset = curator.curate_dataset(examples)
        return examples

    def _validate_dataframe(self, dataframe: pd.DataFrame):
        required_keys = [self.input_key]
        forbidden_keys = [self.output_key]
        missing = [k for k in required_keys if k not in dataframe.columns]
        conflict = [k for k in forbidden_keys if k in dataframe.columns]
        if missing:
            raise ValueError(f"Missing required column(s): {missing}")
        if conflict:
            raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")

    def run(
        self,
        storage: DataFlowStorage = None,
        input_key: str = 'cleaned_chunk',
        output_key: str = 'QA_pairs',
        output_meta_key: str = 'QA_metadata',
    ):
        self.input_key, self.output_key, self.output_meta_key = input_key, output_key, output_meta_key
        dataframe = storage.read("dataframe")
        self._validate_dataframe(dataframe)
        texts = dataframe[self.input_key].tolist()
        outputs = self.process_batch(texts)
        # Keep at most num_q QA pairs per row (slicing already handles shorter lists)
        dataframe[self.output_key] = [output['qa_pairs'][:self.num_q] for output in outputs]
        dataframe[self.output_meta_key] = [output['metadata'] for output in outputs]
        output_file = storage.write(dataframe)
        self.logger.info(f"Results saved to {output_file}")
        return [output_key]


class ExampleConstructor:
    r"""Constructs training examples from raw text data.

    This class handles the construction of training examples by preprocessing
    text, extracting information pairs, and generating question-answer pairs.
    """

    def __init__(
        self,
        lang: str = "en",
        llm_serving: LLMServingABC = None,
        min_text_length: int = 100,
        max_text_length: int = 200000,
        prompt_template=None
    ):
        r"""Initialize the ExampleConstructor.

        Args:
            lang (str, optional): Language of the input text.
                (default: :obj:`"en"`)
            llm_serving (Optional[LLMServingABC], optional): LLM serving
                backend used to generate QA pairs. (default: :obj:`None`)
            min_text_length (int, optional): Minimum accepted text length.
                (default: :obj:`100`)
            max_text_length (int, optional): Maximum accepted text length.
                (default: :obj:`200000`)
            prompt_template (optional): Prompt template instance; falls back
                to Text2MultiHopQAGeneratorPrompt. (default: :obj:`None`)
        """
        self.lang = lang
        self.llm_serving = llm_serving
        self.logger = get_logger()
        self.max_length = max_text_length
        self.min_length = min_text_length
        if prompt_template:
            self.prompt_template = prompt_template
        else:
            self.prompt_template = Text2MultiHopQAGeneratorPrompt(lang=self.lang)

    def construct_examples(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        r"""Construct training examples from raw data.

        Args:
            raw_data (List[Dict[str, Any]]): List of raw data dictionaries
                containing text and metadata.

        Returns:
            List[Dict[str, Any]]: List of constructed examples with QA pairs
                and metadata.
        """
        self.logger.info("Starting to construct examples...")
        examples = []
        for data in tqdm(raw_data, desc="Constructing examples"):
            # 1. Text preprocessing
            processed_text = self._preprocess_text(data.get('text', ''))
            if not processed_text:
                example = {
                    # 'text': processed_text,
                    'qa_pairs': [],
                    'metadata': {
                        'source': data.get('source', 'unknown'),
                        'timestamp': data.get('timestamp', ''),
                        'complexity': 0,
                    },
                }
                examples.append(example)
                continue
            # 2. Generate key information pairs
            info_pairs = self._extract_info_pairs(processed_text)
            # 3. Construct question-answer pairs
            if info_pairs:
                qa_pairs = self._generate_qa_pairs(info_pairs)
            else:
                qa_pairs = []
            # 4. Add metadata
            example = {
                # 'text': processed_text,
                'qa_pairs': qa_pairs,
                'metadata': {
                    'source': data.get('source', 'unknown'),
                    'timestamp': data.get('timestamp', ''),
                    'complexity': self._calculate_complexity(qa_pairs) if qa_pairs else 0,
                },
            }
            examples.append(example)
        self.logger.info(f"Successfully constructed {len(examples)} examples")
        return examples

    def _preprocess_text(self, text: str) -> str:
        r"""Preprocess input text for example construction.

        Args:
            text (str): Input text to preprocess.

        Returns:
            str: Preprocessed text, or empty string if text fails quality
                checks.
        """
        if not isinstance(text, str):
            return ''
        # 1. Basic cleaning
        text = text.strip()
        # 2. Length check
        if len(text) < self.min_length or len(text) > self.max_length:
            self.logger.warning("Text failed to pass the length check.")
            return ''
        # 3. Quality check
        if not self._check_text_quality(text):
            self.logger.warning("Text failed to pass the quality check.")
            return ''
        return text

    def _calculate_special_char_ratio(self, text):
        # Unicode ranges for Chinese characters (basic CJK plus extensions)
        chinese_ranges = [
            (0x4E00, 0x9FFF),    # CJK Unified Ideographs
            (0x3400, 0x4DBF),    # Extension A
            (0x20000, 0x2A6DF),  # Extension B
            (0x2A700, 0x2B73F),  # Extension C
            (0x2B740, 0x2B81F),  # Extension D
            (0x2B820, 0x2CEAF),  # Extension E
        ]
        special_count = 0
        for c in text:
            # Count characters that are not alphanumeric, whitespace, or Chinese
            is_chinese = any(start <= ord(c) <= end for start, end in chinese_ranges)
            if not (c.isalnum() or c.isspace() or is_chinese):
                special_count += 1
        return special_count / len(text) if text else 0

    def _check_text_quality(self, text: str) -> bool:
        r"""Check the quality of input text.

        Args:
            text (str): Text to check quality for.

        Returns:
            bool: True if text passes quality checks, False otherwise.
        """
        # 1. Basic quality check: must have at least 2 sentences
        if text.count('。') < 2 and text.count('.') < 2:
            return False
        # 2. Special character ratio check: no more than 30% special characters
        special_char_ratio = self._calculate_special_char_ratio(text)
        if special_char_ratio > 0.3:
            return False
        return True

    def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
        r"""Extract information pairs and relationships from text.

        Args:
            text (str): Input text to extract information from.

        Returns:
            List[Dict[str, Sequence[str]]]: List of dictionaries containing
                premise, intermediate, conclusion, and related contexts.
        """
        # Split into sentences
        if self.lang == "en":
            sentences = [s.strip() for s in text.split('.') if s.strip()]
        else:
            sentences = [s.strip() for s in text.split('。') if s.strip()]
        info_pairs = []
        # Extract combinations of multiple related sentences
        for i in range(len(sentences) - 2):
            if len(sentences[i]) > 10 and len(sentences[i + 1]) > 10:
                info_pairs.append(
                    {
                        'premise': sentences[i],
                        'intermediate': sentences[i + 1],
                        'conclusion': sentences[i + 2] if i + 2 < len(sentences) else '',
                        'related_contexts': [
                            s for j, s in enumerate(sentences)
                            if j != i and j != i + 1 and len(s) > 10
                        ][:2],  # Limit to 2 additional related contexts
                    }
                )
        return info_pairs

    def _generate_qa_pairs(self, info_pairs: List[Dict[str, Sequence[str]]]) -> List[Dict[str, str]]:
        r"""Generate multi-hop question-answer pairs from information pairs.

        Args:
            info_pairs (List[Dict[str, Sequence[str]]]): List of information
                pairs extracted from text.

        Returns:
            List[Dict[str, str]]: List of generated QA pairs.
        """
        user_inputs = []
        for pair in info_pairs:
            # Generate a multi-hop question-answer pair using the LLM:
            # construct the full context from premise, intermediate, and conclusion
            context = (
                f"{pair['premise']}. {pair['intermediate']}."
                f" {pair['conclusion']}"
            )
            user_inputs.append(self.prompt_template.build_prompt(context))
        sys_prompt = self.prompt_template.build_system_prompt()
        responses = self.llm_serving.generate_from_input(user_inputs=user_inputs, system_prompt=sys_prompt)
        qa_pairs = self._extract_qa_pairs(responses)
        return qa_pairs

    def _extract_qa_pairs(self, responses: List[str]) -> List[Dict[str, Any]]:
        """Extract well-formed QA pairs from raw model responses,
        skipping invalid JSON and surrounding noise.
        """
        qa_pairs = []
        for response in responses:
            # Method 1: try to parse the whole response directly as JSON
            try:
                qa_pair = json.loads(response)
                if isinstance(qa_pair, dict) and "question" in qa_pair:
                    qa_pairs.append(qa_pair)
                    continue
                elif isinstance(qa_pair, list):
                    for item in qa_pair:
                        if isinstance(item, dict) and "question" in item:
                            qa_pairs.append(item)
                    continue
            except json.JSONDecodeError:
                pass
            # Method 2: scan for balanced top-level {...} spans and parse each one
            try:
                brace_count = 0
                start_pos = -1
                json_objects = []
                for i, char in enumerate(response):
                    if char == '{':
                        if brace_count == 0:
                            start_pos = i
                        brace_count += 1
                    elif char == '}':
                        brace_count -= 1
                        if brace_count == 0 and start_pos != -1:
                            json_objects.append(response[start_pos:i + 1])
                            start_pos = -1
                # Try to parse every candidate JSON string
                for json_str in json_objects:
                    try:
                        qa_pair = json.loads(json_str)
                        if (isinstance(qa_pair, dict) and
                                "question" in qa_pair and
                                "reasoning_steps" in qa_pair and
                                "answer" in qa_pair and
                                "supporting_facts" in qa_pair and
                                "type" in qa_pair):
                            qa_pairs.append(qa_pair)
                            self.logger.info(f"Successfully extracted QA pair: {qa_pair['question']}")
                    except json.JSONDecodeError as e:
                        self.logger.debug(f"Failed to parse JSON object: {json_str[:100]}... Error: {e}")
                        continue
                # Deduplicate QA pairs that share the same question
                if qa_pairs:
                    seen_questions = set()
                    unique_qa_pairs = []
                    for qa_pair in qa_pairs:
                        question = qa_pair.get("question", "").strip().lower()
                        if question and question not in seen_questions:
                            seen_questions.add(question)
                            unique_qa_pairs.append(qa_pair)
                            self.logger.debug(f"Added unique question: {qa_pair['question']}")
                        else:
                            self.logger.debug(f"Skipped duplicate question: {qa_pair.get('question', 'N/A')}")
                    qa_pairs = unique_qa_pairs
                    self.logger.info(f"After deduplication: {len(qa_pairs)} unique QA pairs")
                # Warn if no JSON objects were found at all
                if not json_objects:
                    self.logger.warning("No JSON objects found in model response.")
            except Exception as e:
                self.logger.warning(f"Failed to parse QA information from model response. Error: {e}")
        return qa_pairs

    def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
        r"""Calculate the complexity score for a set of QA pairs.

        Args:
            qa_pairs (List[Dict[str, Any]]): List of QA pairs to calculate
                complexity for.

        Returns:
            float: Complexity score between 0.0 and 1.0.
        """
        if not qa_pairs:
            return 0.0
        # Calculate complexity based on multiple factors
        complexities = []
        for qa in qa_pairs:
            # 1. Number of reasoning steps
            reasoning_steps_count = len(qa.get('reasoning_steps', []))
            # 2. Number of supporting facts
            supporting_facts_count = len(qa.get('supporting_facts', []))
            # 3. Question length
            question_length = len(qa.get('question', '').split())
            # 4. Answer length
            answer_length = len(qa.get('answer', '').split())
            # Calculate complexity of a single QA pair
            qa_complexity = (
                min(reasoning_steps_count / 3, 1.0) * 0.4     # Weight for reasoning steps
                + min(supporting_facts_count / 3, 1.0) * 0.3  # Weight for supporting facts
                + min(question_length / 20, 1.0) * 0.15       # Weight for question length
                + min(answer_length / 50, 1.0) * 0.15         # Weight for answer length
            )
            complexities.append(qa_complexity)
        return sum(complexities) / len(complexities)
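
A usage sketch for the operator above. `my_llm_serving` and `my_storage` are hypothetical; the stored dataframe is assumed to carry a 'cleaned_chunk' column of passage text:

# Usage sketch (my_llm_serving / my_storage are hypothetical implementations).
from dataflow.operators.core_text import Text2MultiHopQAGenerator

op = Text2MultiHopQAGenerator(
    llm_serving=my_llm_serving,
    seed=42,
    lang="en",
    num_q=3,   # keep at most 3 QA pairs per passage
)
op.run(my_storage, input_key="cleaned_chunk", output_key="QA_pairs", output_meta_key="QA_metadata")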
dataflow/operators/core_text/generate/text2qa_generator.py
0 → 100644
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict
import json
from dataflow.prompts.text2qa import Text2QASeedQuestionGeneratorPrompt, Text2QAAutoPromptGeneratorPrompt


@prompt_restrict(
    Text2QAAutoPromptGeneratorPrompt,
    Text2QASeedQuestionGeneratorPrompt
)
@OPERATOR_REGISTRY.register()
class Text2QAGenerator(OperatorABC):
    '''
    Text2QAGenerator uses LLMs to generate QA pairs based on seed input.
    '''
    def __init__(
        self,
        llm_serving: LLMServingABC,
        # prompt_template is fixed for this operator
    ):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.prompt_template = Text2QAAutoPromptGeneratorPrompt()

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子用于为给定的文档片段生成种子QA对。\n\n"
                "输入参数:\n"
                "- input_key: 包含文档片段的字段名\n"
                "- prompt_key: 包含提示词的字段名\n"
                "- output_question_key: 包含生成问题的字段名\n"
                "- output_answer_key: 包含生成答案的字段名\n"
            )
        elif lang == "en":
            return (
                "This operator generates seed QA pairs for given document fragments.\n\n"
                "Input Parameters:\n"
                "- input_key: Field name containing the content\n"
                "- prompt_key: Field name containing the generated prompt\n"
                "- output_question_key: Field name containing the generated question\n"
                "- output_answer_key: Field name containing the generated answer\n"
            )
        else:
            return "QAGenerator generates QA pairs for given document fragments."

    def _validate_dataframe(self, dataframe: pd.DataFrame):
        required_keys = [self.input_key]
        forbidden_keys = [self.output_question_key, self.output_answer_key]
        missing = [k for k in required_keys if k not in dataframe.columns]
        conflict = [k for k in forbidden_keys if k in dataframe.columns]
        if missing:
            raise ValueError(f"Missing required column(s): {missing}")
        if conflict:
            raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")

    def _build_prompt(self, df, types):
        if types == "prompt":
            self.prompt_template = Text2QAAutoPromptGeneratorPrompt()
            texts = df[self.input_key].tolist()
            output = [self.prompt_template.build_prompt(text) for text in texts]
        elif types == "qa":
            self.prompt_template = Text2QASeedQuestionGeneratorPrompt()
            output = []
            for index, row in df.iterrows():
                output.append(row[self.output_prompt_key] + self.prompt_template.build_prompt() + row[self.input_key])
        return output

    def _parse_qa(self, response: str) -> tuple:
        lines = response.strip().split('\n')
        q = next((line[2:].strip() for line in lines if line.lower().startswith("q:")), "")
        a = next((line[2:].strip() for line in lines if line.lower().startswith("a:")), "")
        return q, a

    def parse_list_string(self, s: str) -> list:
        # Strip the surrounding [ ] brackets
        s = s.strip()[1:-1]
        # Split on commas and drop empty items
        items = [item.strip() for item in s.split(",") if item.strip()]
        return items

    def run(
        self,
        storage: DataFlowStorage,
        input_key: str = "text",
        input_question_num: int = 1,
        output_prompt_key: str = "generated_prompt",
        output_question_key: str = "generated_question",
        output_answer_key: str = "generated_answer"
    ):
        '''
        Runs the QA generation process, reading from the input file and saving results to output.
        '''
        self.input_key, self.input_question_num, self.output_prompt_key, self.output_question_key, self.output_answer_key = (
            input_key, input_question_num, output_prompt_key, output_question_key, output_answer_key
        )
        dataframe = storage.read("dataframe")
        self._validate_dataframe(dataframe)
        formatted_prompts = self._build_prompt(dataframe, "prompt")
        raw_prompts = self.llm_serving.generate_from_input(user_inputs=formatted_prompts, system_prompt="")
        prompts = []
        for i, p in enumerate(raw_prompts):
            try:
                prompts.append(json.loads(p))
            except json.JSONDecodeError:
                self.logger.warning(f"Failed to parse prompt at index {i}: {p}")
                # Keep an empty entry so prompt indices stay aligned with dataframe rows
                prompts.append([])
        expanded_rows = []
        expanded_prompts = []
        for idx, prompt_list in enumerate(prompts):
            for p in prompt_list[:min(self.input_question_num, len(prompt_list))]:
                expanded_rows.append(dataframe.iloc[idx].to_dict())  # duplicate the source row
                expanded_prompts.append(p)                           # its generated prompt
        dataframe = pd.DataFrame(expanded_rows)
        dataframe[self.output_prompt_key] = expanded_prompts
        formatted_prompts = self._build_prompt(dataframe, "qa")
        responses = self.llm_serving.generate_from_input(user_inputs=formatted_prompts, system_prompt="")
        questions, answers = zip(*[self._parse_qa(r) for r in responses])
        dataframe[self.output_question_key] = questions
        dataframe[self.output_answer_key] = answers
        output_file = storage.write(dataframe)
        self.logger.info(f"Results saved to {output_file}")
        return [self.output_question_key, self.output_answer_key]
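
A usage sketch for the two-stage flow above (auto-prompt generation, then seed QA generation). `my_llm_serving` and `my_storage` are hypothetical:

# Usage sketch: each source row is expanded into up to input_question_num
# (prompt, question, answer) rows.
from dataflow.operators.core_text import Text2QAGenerator

op = Text2QAGenerator(llm_serving=my_llm_serving)
keys = op.run(
    my_storage,
    input_key="text",
    input_question_num=2,   # keep at most 2 generated prompts per document
)
print(keys)  # ["generated_question", "generated_answer"]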
dataflow/operators/core_text/refine/pandas_operator.py
0 → 100644
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
import pandas as pd


@OPERATOR_REGISTRY.register()
class PandasOperator(OperatorABC):
    def __init__(self, process_fn: list):
        self.logger = get_logger()
        self.process_fn = process_fn
        self.logger.info(f"Initializing {self.__class__.__name__} with transform functions: {self.process_fn}")

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "该算子支持通过多个自定义函数对 DataFrame 进行任意操作(如添加列、重命名、排序等)。\n\n"
                "每个函数(通常为 lambda 表达式)接受一个 DataFrame 并返回一个修改后的 DataFrame。\n\n"
                "输入参数:\n"
                "- process_fn:一个函数列表,每个函数形式为 lambda df: ...,"
                "必须返回一个 DataFrame。\n\n"
                "示例:\n"
                " - lambda df: df.assign(score2=df['score'] * 2)\n"
                " - lambda df: df.sort_values('score', ascending=False)"
            )
        elif lang == "en":
            return (
                "This operator applies a list of transformation functions to a DataFrame.\n\n"
                "Each function (typically a lambda) takes a DataFrame and returns a modified DataFrame.\n\n"
                "Input Parameters:\n"
                "- process_fn: A list of functions, each in the form of lambda df: ..., "
                "and must return a DataFrame.\n\n"
                "Examples:\n"
                " - lambda df: df.assign(score2=df['score'] * 2)\n"
                " - lambda df: df.sort_values('score', ascending=False)"
            )
        else:
            return "Applies a sequence of transformation functions to a DataFrame."

    def run(self, storage: DataFlowStorage):
        df = storage.read("dataframe")
        for fn in self.process_fn:
            if not callable(fn):
                raise ValueError("Each transform function must be callable (e.g., lambda df: ...)")
            df = fn(df)
            if not isinstance(df, pd.DataFrame):
                raise ValueError("Each transform function must return a DataFrame")
        self.logger.info(f"Transformation complete. Final shape: {df.shape}")
        storage.write(df)
        return ""
dataflow/operators/core_text/refine/prompted_refiner.py
0 → 100644
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC


@OPERATOR_REGISTRY.register()
class PromptedRefiner(OperatorABC):
    '''
    PromptedRefiner rewrites the text in a chosen column in place, using a
    system prompt to describe the desired rewrite.
    '''
    def __init__(self,
                 llm_serving: LLMServingABC,
                 system_prompt: str = "You are a helpful agent."
                 ):
        self.logger = get_logger()
        self.llm_serving = llm_serving
        self.system_prompt = system_prompt

    @staticmethod
    def get_desc(lang: str = "zh"):
        if lang == "zh":
            return (
                "PromptedRefiner 根据给定的 system_prompt 对指定列的文本进行改写/润色/规范化,"
                "并将结果**就地写回**同一列(覆盖原内容)。其做法是对每一行拼接 "
                "`system_prompt + raw_content` 作为模型输入,批量生成改写结果。\n"
                "\n输入参数:\n"
                "- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n"
                "- system_prompt:系统提示词,用于描述改写目标与风格(默认 'You are a helpful agent.')\n"
                "- input_key:要改写的文本列名(默认 'raw_content'),改写后会覆盖该列\n"
                "\n输出参数:\n"
                "- 覆盖后的 DataFrame(同名列被改写后的文本)\n"
                "- 无返回值(结果已通过 DataFlowStorage 写出)\n"
                "\n备注:\n"
                "- 该算子**覆盖** input_key 列;若需保留原文,建议先拷贝到新列。\n"
                "- 期望每行在 input_key 列提供可用文本;空值将不会生成对应输入,如与行数不匹配可能导致赋值报错。"
            )
        elif lang == "en":
            return (
                "PromptedRefiner rewrites/refines/normalizes text in a specified column **in place**, "
                "using a provided system_prompt. For each row it concatenates "
                "`system_prompt + raw_content` as the model input and generates the refined text.\n"
                "\nInput Parameters:\n"
                "- llm_serving: LLM serving object implementing LLMServingABC\n"
                "- system_prompt: Instruction describing the rewrite goal/style (default 'You are a helpful agent.')\n"
                "- input_key: Column to refine (default 'raw_content'); the refined output **overwrites** this column\n"
                "\nOutput:\n"
                "- DataFrame with the same column overwritten by refined text\n"
                "- No return value (the result is written via DataFlowStorage)\n"
                "\nNotes:\n"
                "- This operator **overwrites** the input_key column; copy it first if you need to keep originals.\n"
                "- Each row is expected to provide text in input_key; missing/empty rows won’t form inputs, which may cause "
                "length-mismatch errors on assignment."
            )
        else:
            return (
                "PromptedRefiner rewrites a chosen column in place using `system_prompt + raw_content` as input."
            )

    def run(self,
            storage: DataFlowStorage,
            input_key: str = "raw_content"
            ):
        self.input_key = input_key
        self.logger.info("Running PromptedRefiner...")
        # Load the raw dataframe from the input file
        dataframe = storage.read('dataframe')
        self.logger.info(f"Loading, number of rows: {len(dataframe)}")
        # Prepare LLM inputs by prepending the system prompt to each row's
        # content. Note: rows with missing/empty text are skipped, which is
        # why every row is expected to provide text (see Notes in get_desc).
        llm_inputs = []
        for index, row in dataframe.iterrows():
            raw_content = row.get(self.input_key, '')
            if raw_content:
                llm_input = self.system_prompt + str(raw_content)
                llm_inputs.append(llm_input)
        # Generate the text using the model
        try:
            self.logger.info("Generating text using the model...")
            generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
            self.logger.info("Text generation completed.")
        except Exception as e:
            self.logger.error(f"Error during text generation: {e}")
            return
        # Write the refined text back over the input column
        dataframe[self.input_key] = generated_outputs
        # Save the updated dataframe to the output file
        output_file = storage.write(dataframe)
        return
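
A usage sketch for the refiner above; `my_llm_serving` and `my_storage` are hypothetical. Since the column is overwritten, copy it first if the originals need to be preserved:

# Usage sketch (my_llm_serving / my_storage are hypothetical implementations).
from dataflow.operators.core_text import PromptedRefiner

op = PromptedRefiner(
    llm_serving=my_llm_serving,
    system_prompt="Fix grammar and normalize punctuation in the following text: ",
)
op.run(my_storage, input_key="raw_content")  # 'raw_content' is rewritten in place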
dataflow/operators/core_vision/__init__.py
0 → 100644
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # generate
    from .generate.prompted_vqa_generator import PromptedVQAGenerator
else:
    import sys
    from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking

    cur_path = "dataflow/operators/core_vision/"

    _import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
    sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/core_vision/", _import_structure)