Commit 97e8278b authored by zzg_666

Adapt backend to vLLM

import os
import copy
import socket
import webbrowser
from http.server import HTTPServer, SimpleHTTPRequestHandler
from abc import ABC, abstractmethod
from collections import Counter, defaultdict
from dataflow.core.operator import OperatorABC
from dataflow.pipeline.nodes import OperatorNode, KeyNode
from dataflow.wrapper.auto_op import AutoOP, OPRuntime
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OPERATOR_CLASSES, LLM_SERVING_CLASSES
import atexit
from datetime import datetime
from dataflow.logger import get_logger
import colorsys
class PipelineABC(ABC):
def __init__(self):
# list of OPRuntime objects, each holding an operator and the parameters for `operator.run()`
self.op_runtimes:list[OPRuntime] = []
self.compiled = False
# accumulated keys in each operators, index 0 refers to the keys before the first operator
self.accumulated_keys = [] # list of lists, each sublist contains keys before the operator
# other items
self.logger = get_logger()
self.active_llm_serving = None
# self.serving_resources = defaultdict(dict)
# self.serving_reference_counter = Counter()
self.op_nodes_list : list[OperatorNode] = []
self.llm_serving_list = [] # list of LLMServing objects
self.llm_serving_counter = Counter() # count of LLMServing objects
@abstractmethod
def forward(self):
"""
Main Function to run the pipeline
"""
pass
def compile(self):
self.compiled = True # flag the pipeline as compiled
for k, v in vars(self).items():
if isinstance(v, OperatorABC):
setattr(self, k, AutoOP(v, k, self))
self.forward()
# after calling forward(), the callback in AutoOP appends an OPRuntime object to self.op_runtimes
self.forward = self._compiled_forward
self.logger.info(
f"Compiling pipeline and validating key integrity "
f"across {len(self.op_runtimes)} operator runtimes."
)
self._build_operator_nodes_graph()
# self._draw_graph_for_operators()
# self._build_serving_resources_map()
def _build_operator_nodes_graph(self):
"""
Build a graph of operator nodes, each node contains the operator object and its storage.
"""
for op_runtime in self.op_runtimes:
llm_serving_obj, storage_obj = None, None
# get llm_serving object from the operator
for _, v in vars(op_runtime.op).items():
if isinstance(v, LLM_SERVING_CLASSES):
llm_serving_obj = v
# get storage object from the function dict
storage_obj = op_runtime.kwargs.pop("storage", None)
assert isinstance(storage_obj, DataFlowStorage), f"Storage must be a DataFlowStorage object, but got {type(storage_obj)} in {op_runtime}'s `run` function with key `storage`."
# create an operator node
op_node = OperatorNode(
op_obj=op_runtime.op,
op_name=op_runtime.op_name,
storage=storage_obj,
llm_serving=llm_serving_obj,
**op_runtime.kwargs
)
# append to lists, if None, just keep it
self.op_nodes_list.append(op_node)
self.llm_serving_list.append(llm_serving_obj)
if llm_serving_obj is not None:
self.llm_serving_counter[llm_serving_obj] += 1
self.logger.debug(f"Built operator nodes graph with {self.op_nodes_list} nodes, \nand {self.llm_serving_list} LLM Serving objects.")
# get keys in the first storage:
first_op = self.op_nodes_list[0] if self.op_nodes_list else None
if first_op and first_op.storage:
iter_storage_keys = first_op.storage.get_keys_from_dataframe()
else:
iter_storage_keys = []
# print("start keys", iter_storage_keys)
# all keys in the first storage will be the initial keys for validation
self.accumulated_keys.append(copy.deepcopy(iter_storage_keys))
error_msg = []
# build graph of all operators and keys from all states
for op_node in self.op_nodes_list:
# check if accumulated_keys have the input keys of this operator
# print(op_node, op_node.input_keys, op_node.output_keys)
for input_key in op_node.input_keys:
if input_key not in self.accumulated_keys[-1]:
error_msg.append(
{
"input_key": input_key,
"op_name": op_node.op_name,
"class_name": op_node.op_obj.__class__.__name__,
"key_para_name": op_node.input_key_nodes[input_key].key_para_name
}
)
# add output keys to accumulated keys
for output_key in op_node.output_keys:
if output_key not in iter_storage_keys:
iter_storage_keys.append(output_key)
self.accumulated_keys.append(copy.deepcopy(iter_storage_keys))
if len(error_msg) != 0:
# final_error_str = "KeyError in following Operators during pipeline.compile():"
details = "\n".join(
f"- Input key '{e['input_key']}' in `{e['op_name']}` "
f"(class <{e['class_name']}>) does not match any output keys "
f"from previous operators or dataset keys. "
f"Check parameter '{e['key_para_name']}' in the `{e['op_name']}.run()`."
for e in error_msg
)
msg = f"Key Matching Error in following Operators during pipeline.compile():\n{details}"
self.logger.warning(msg)
raise KeyError(msg)
self.final_keys = copy.deepcopy(iter_storage_keys)
for i, keys in enumerate(self.accumulated_keys):
# print(i, keys)
pass
self.logger.debug(f"Accumulated keys after building graph: {self.accumulated_keys}")
self.input_dataset_node = OperatorNode(
None,
"DATASET-INPUT",
None,
None,
)
self.input_dataset_node.init_output_keys_nodes(self.accumulated_keys[0])
self.op_nodes_list.insert(0, self.input_dataset_node)
self.output_dataset_node = OperatorNode(
None,
"DATASET-OUTPUT",
None,
None,
)
self.output_dataset_node.init_input_keys_nodes(self.final_keys)
self.op_nodes_list.append(self.output_dataset_node)
# set a default dict for all keys
self.last_modified_index_of_keys: dict[str, list[int]] = {}
for key in self.final_keys:
self.last_modified_index_of_keys[key] = []
# print(self.last_modified_index_of_keys)
# now the first op node is the DATASET-INPUT node
for idx, i_op in enumerate(self.op_nodes_list):
# check for input keys
for input_key in i_op.input_keys:
current_keynode:KeyNode = i_op.input_key_nodes[input_key]
current_keynode.set_index(idx)
if len(self.last_modified_index_of_keys[input_key]) > 0:
last_modified_idx = self.last_modified_index_of_keys[input_key][-1]
last_modified_keynode:KeyNode = self.op_nodes_list[last_modified_idx].output_keys_nodes[input_key]
# double-sided pointers linking the two key nodes
last_modified_keynode.ptr.append(current_keynode)
current_keynode.ptr.append(last_modified_keynode)
# check for output keys
for output_key in i_op.output_keys:
current_keynode:KeyNode = i_op.output_keys_nodes[output_key]
current_keynode.set_index(idx)
self.last_modified_index_of_keys[output_key].append(idx)
for key, value in self.last_modified_index_of_keys.items():
# print(key, value)
pass
for op in self.op_nodes_list:
# print(op)
self.logger.debug(f"Operator Node: {op}")
pass
# deprecated, use `draw_graph` instead, archived for compatibility
def _draw_graph_for_operators(self):
raise DeprecationWarning(
"The `_draw_graph_for_operators` method is deprecated. "
"Please use `draw_graph` method instead for better visualization.")
def _get_op_node_str(self, node:OperatorNode):
input_keys_string = ""
for i_key_node in node.input_key_nodes.values():
input_keys_string += f"\n{i_key_node.key_para_name}={i_key_node.key}"
output_keys_string = ""
for o_key_node in node.output_keys_nodes.values():
output_keys_string += f"\n{o_key_node.key_para_name}={o_key_node.key}"
# return f"{node.op_name}\n{node.op_obj.__class__.__name__}\n{node.llm_serving.__class__.__name__ if node.llm_serving else 'None'}\n{input_keys_string}\n --- \n{output_keys_string}"
return f"{node.op_name}\n{node.op_obj.__class__.__name__}\n"
try:
import networkx
except ImportError:
raise ImportError("Please install networkx to draw the graph. Run `pip install networkx[default]`.")
import matplotlib.pyplot as plt
G = networkx.DiGraph()
# add OP nodes
for op_node in self.op_nodes_list:
G.add_node(op_node, label=self._get_op_node_str(op_node))
# add edges between OP nodes
for op_node in self.op_nodes_list:
for output_key_nodes in op_node.output_keys_nodes.values():
for ptr_key_node in output_key_nodes.ptr:
G.add_edge(op_node, self.op_nodes_list[ptr_key_node.index], label=ptr_key_node.key)
# draw the figure
pos = networkx.spring_layout(G)
# pos = networkx.drawing.nx_agraph.graphviz_layout(G, prog='dot')
# pos = networkx.kamada_kawai_layout(G)
# pos = networkx.spectral_layout(G)
# Set the figure size
num_nodes = len(G.nodes)
plt.figure(figsize=(max(10, num_nodes * 0.5), max(8, num_nodes * 0.5)))
# Draw the graph using custom labels
labels = {node: data['label'] for node, data in G.nodes(data=True)}
networkx.draw(G, pos, labels=labels, with_labels=True, node_size=1000, node_shape='s', node_color='lightblue', edge_color='gray', arrows=True)
# Draw the edge labels
edge_labels = networkx.get_edge_attributes(G, 'label')
networkx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
# Save the figure
plt.savefig("operators_graph.png", bbox_inches='tight')
plt.show()
def draw_graph(
self,
port=0,
hide_no_changed_keys=True
):
# compile check
if not self.compiled:
self.logger.error("Pipeline is not compiled yet. Please call `compile()` before drawing the graph.")
raise RuntimeError("Pipeline is not compiled yet. Please call `compile()` before drawing the graph.")
# import check if pyvis is installed
try:
from pyvis.network import Network
except ImportError:
raise ImportError("Please install pyvis to draw graph of current pipeline. Please run `pip install pyvis`.")
def _get_op_node_str(node, step = None):
op_class_name = node.op_obj.__class__.__name__ if node.op_obj.__class__.__name__ !="NoneType" else "Storage/No-Op"
if step is not None:
return f"{node.op_name}\n<{op_class_name}>\n(step={step})\n"
else:
return f"{node.op_name}\n<{op_class_name}>\n"
def _get_op_node_title(node):
input_keys_string = ""
op_class_name = node.op_obj.__class__.__name__ if node.op_obj.__class__.__name__ !="NoneType" else "Storage/No-Op"
if op_class_name == "Storage/No-Op":
for i_key_node in node.input_key_nodes.values():
input_keys_string += f" {i_key_node.key}\n"
output_keys_string = ""
for o_key_node in node.output_keys_nodes.values():
output_keys_string += f" {o_key_node.key}\n"
else:
for i_key_node in node.input_key_nodes.values():
input_keys_string += f" {i_key_node.key_para_name}={i_key_node.key}\n"
output_keys_string = ""
for o_key_node in node.output_keys_nodes.values():
output_keys_string += f" {o_key_node.key_para_name}={o_key_node.key}\n"
if input_keys_string == "":
input_keys_string = " None\n"
if output_keys_string == "":
output_keys_string = " None\n"
return (
f"Attrbute: {node.op_name}\n"
f"Class: {op_class_name}\n"
f"------\n"
f"Input:\n {input_keys_string}"
f"------\n"
f"Output:\n {output_keys_string}"
)
def _hex_to_rgb(hex_color):
hex_color = hex_color.lstrip("#")
return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
def _rgb_to_hex(rgb):
return "#{:02x}{:02x}{:02x}".format(*rgb)
def _lerp_color(c1, c2, t):
return tuple(int(c1[i] + (c2[i] - c1[i]) * t) for i in range(3))
def _step_to_color(step, total_steps):
"""
Muted gradient: grey-blue → grey-purple → cool blue
"""
# start = _hex_to_rgb("#bfdefc") # 浅钢蓝
# mid = _hex_to_rgb("#aaa1d3") # 灰紫
# end = _hex_to_rgb("#888794") # 冷蓝灰
# start = _hex_to_rgb("#FCF1D0") # 浅钢蓝
# mid = _hex_to_rgb("#DBFFDD") # 灰紫
# end = _hex_to_rgb("#C8E0F9") # 冷蓝灰
start = _hex_to_rgb("#D5B4EC") # 浅钢蓝
mid = _hex_to_rgb("#879DF8") # 灰紫
end = _hex_to_rgb("#81CDF9") # 冷蓝灰
if total_steps <= 1:
return _rgb_to_hex(start)
mid_point = (total_steps - 1) / 2
if step <= mid_point:
t = step / mid_point
rgb = _lerp_color(start, mid, t)
else:
t = (step - mid_point) / mid_point
rgb = _lerp_color(mid, end, t)
return _rgb_to_hex(rgb)
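# Worked example: with total_steps=5, mid_point is 2.0, so
# _step_to_color(0, 5) -> "#d5b4ec" (start), _step_to_color(2, 5) -> "#879df8" (mid),
# _step_to_color(4, 5) -> "#81cdf9" (end); intermediate steps are linear blends.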
# def _step_to_color(step, total_steps):
# # Red (255, 0, 0) → Blue (0, 0, 255)
# r_start, g_start, b_start = (255, 0, 0)
# r_end, g_end, b_end = (0, 0, 255)
# t = step / max(total_steps - 1, 1) # normalize to [0, 1]
# r = int(r_start + (r_end - r_start) * t)
# g = int(g_start + (g_end - g_start) * t)
# b = int(b_start + (b_end - b_start) * t)
# return f"#{r:02x}{g:02x}{b:02x}"
# def _step_to_color(step, total_steps):
# """
# Smooth gradient from red → purple → blue
# """
# if total_steps <= 1:
# return "#ff0000" # plain red when there is only one node
# mid_point = (total_steps - 1) / 2
# if step <= mid_point:
# # Red (0°) → Purple (300°)
# h_start, h_end = 0 / 360, 300 / 360
# t = step / mid_point
# else:
# # Purple (300°) → Blue (240°)
# h_start, h_end = 300 / 360, 240 / 360
# t = (step - mid_point) / mid_point
# # Keep saturation and lightness fixed at high values
# s, l = 1.0, 0.5
# h = h_start + (h_end - h_start) * t
# r, g, b = colorsys.hls_to_rgb(h, l, s)
# return f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}"
# Build the PyVis graph
net = Network(height="800px", width="100%", directed=True)
net.force_atlas_2based()
net.toggle_physics(True)
net.set_options("""
{
"physics": {
"forceAtlas2Based": {
"springLength": 300,
"springConstant": 0.01
},
"minVelocity": 0.75,
"solver": "forceAtlas2Based"
}
}
""")
for idx, op_node in enumerate(self.op_nodes_list):
node_color = _step_to_color(idx, len(self.op_nodes_list))
net.add_node(
n_id=id(op_node),
label=_get_op_node_str(op_node, step=idx),
title=_get_op_node_title(op_node),
color=node_color,
shape="box"
)
for op_node in self.op_nodes_list:
for output_key_nodes in op_node.output_keys_nodes.values():
for ptr_key_node in output_key_nodes.ptr:
target_node = self.op_nodes_list[ptr_key_node.index]
if hide_no_changed_keys and op_node==self.op_nodes_list[0] and target_node==self.op_nodes_list[-1]:
# hide edges for keys that pass unchanged from the dataset input straight to the dataset output
continue
net.add_edge(
source=id(op_node),
to=id(target_node),
label=ptr_key_node.key,
color="gray"
)
# Timestamped filename to avoid overwriting
ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
os.makedirs(".pyvis", exist_ok=True)
output_html = os.path.abspath(os.path.join(".pyvis" ,f"operators_graph_{ts}.html"))
net.save_graph(output_html)
# Automatically delete the file on exit (whether normal exit or Ctrl-C)
def _cleanup():
try:
if os.path.exists(output_html):
os.remove(output_html)
print(f"🧹 Deleted temp file: {output_html}")
except Exception as e:
print(f"Failed to clean up file: {e}")
atexit.register(_cleanup)
# Select port
if port == 0:
sock = socket.socket()
sock.bind(('', 0))
port = sock.getsockname()[1]
sock.close()
# Start HTTP service (main thread blocking)
class SilentHandler(SimpleHTTPRequestHandler):
def log_message(self, format, *args):
pass
# Change to the directory where the file is located so the static service can find it
orig_cwd = os.getcwd()
try:
serve_dir = os.path.dirname(output_html) or "."
os.chdir(serve_dir)
url = f"http://localhost:{port}/{os.path.basename(output_html)}"
print(f"✅ Graph generated, access it at: {url}")
with HTTPServer(('0.0.0.0', port), SilentHandler) as httpd:
print(f"HTTP service started, listening on port {port} (Ctrl-C to exit, HTML file will be deleted on exit)")
try:
webbrowser.open(url)
httpd.serve_forever()
except KeyboardInterrupt:
print("\n❌ Interrupt signal received, exiting and cleaning up file...")
except Exception as e:
print(f"❌ Failed to start HTTP service: {e}")
self.logger.error(f"Failed to start HTTP service: {e}")
finally:
os.chdir(orig_cwd)
# atexit will take care of deleting the file; no need to delete it again here
# # Save the HTML
# output_html = "operators_graph.html"
# net.save_graph(output_html)
# # Select a port
# if port == 0:
# sock = socket.socket()
# sock.bind(('', 0))
# port = sock.getsockname()[1]
# sock.close()
# # Start the HTTP service (main thread)
# class SilentHandler(SimpleHTTPRequestHandler):
# def log_message(self, format, *args):
# pass
# os.chdir(os.path.dirname(os.path.abspath(output_html)))
# url = f"http://localhost:{port}/{output_html}"
# print(f"✅ Graph generated, access it at: {url}")
# webbrowser.open(url)
# # Block until Ctrl-C
# try:
# with HTTPServer(('0.0.0.0', port), SilentHandler) as httpd:
# print(f"HTTP service started, listening on port {port}; press Ctrl-C to exit")
# httpd.serve_forever()
# except KeyboardInterrupt:
# print("\n❌ Visualization service stopped")
# def _build_serving_resources_map(self):
# for op_runtime in self.op_runtimes:
# for _, v in vars(op_runtime.op).items():
# if isinstance(v, LLMServingABC):
# self.serving_resources[op_runtime.op]["LLMServingABC"] = v
# self.serving_reference_count[v] += 1
def _compiled_forward(self):
# for loop for each op and its `storage` status
for op_node in self.op_nodes_list:
self.logger.debug(f"Ready to run {op_node}, with serving={op_node.llm_serving}, active_llm_serving={self.active_llm_serving}")
if op_node.llm_serving is not None:
if self.active_llm_serving and self.active_llm_serving is not op_node.llm_serving:
self.logger.debug(f"Detected active LLM Serving {self.active_llm_serving}, new serving {op_node.llm_serving}, cleaning up...")
self.active_llm_serving.cleanup()
self.active_llm_serving = op_node.llm_serving
op_node.op_obj.run(
storage=op_node.storage,
**op_node.kwargs
)
if op_node.llm_serving is not None:
self.llm_serving_counter[self.active_llm_serving] -= 1
if self.llm_serving_counter[self.active_llm_serving] == 0:
self.logger.debug(f"Detected LLM Serving {self.active_llm_serving} ref reduced to 0, cleaning up...")
self.active_llm_serving.cleanup()
self.active_llm_serving = None
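# Minimal usage sketch (illustrative only; FileStorage and SomeGeneratorOperator are
# hypothetical stand-ins, not names defined in this module): a concrete pipeline
# subclasses PipelineABC, wires its operators in forward(), then compiles and runs.
#
#     class MyPipeline(PipelineABC):
#         def __init__(self):
#             super().__init__()
#             self.storage = FileStorage("input.jsonl")       # some DataFlowStorage implementation
#             self.generator = SomeGeneratorOperator(...)     # some OperatorABC implementation
#
#         def forward(self):
#             self.generator.run(
#                 storage=self.storage,
#                 input_key="question",
#                 output_key="answer",
#             )
#
#     pipeline = MyPipeline()
#     pipeline.compile()     # records OPRuntimes, builds the node graph, validates keys
#     pipeline.draw_graph()  # optional: serve an interactive PyVis visualization
#     pipeline.forward()     # now dispatches to _compiled_forward()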
from .Pipeline import PipelineABC
__all__ = [
'PipelineABC',
]
from __future__ import annotations
from dataflow.core import OperatorABC
from dataflow.core import WrapperABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OPERATOR_CLASSES, LLM_SERVING_CLASSES
from typing import Union
class KeyNode(object):
def __init__(
self,
key_para_name: str,
key: str,
ptr: list[KeyNode] = None
):
self.key_para_name = key_para_name # name of the parameter in the operator's run function
self.key = key
self.ptr = ptr if ptr is not None else [] # ptr to next KeyNode(s), used to build a list of keys
def set_index(self, index:int):
self.index = index
def __str__(self):
current_id = hex(id(self))
ptr_status = [
(node.key, node.index, hex(id(node))) for node in self.ptr
] if len(self.ptr) != 0 else ["None"]
ptr_str = "".join([
f"\n <{item}>" for item in ptr_status
])
return f"\n KeyNode[{current_id}](key_para_name={self.key_para_name}, key={self.key}, ptr_keys={ptr_str})"
def __repr__(self):
return self.__str__()
class OperatorNode(object):
def __init__(
self,
op_obj: OPERATOR_CLASSES = None,
op_name: str = None,
storage: DataFlowStorage = None,
llm_serving: LLM_SERVING_CLASSES = None,
**kwargs
):
self.op_obj = op_obj
self.op_name = op_name
self.storage = storage # will be set when the operator is initialized
self.llm_serving = llm_serving # will be set when the operator is initialized
self.kwargs = kwargs # parameters for the operator's run function
# Initialize input and output keys
self.input_keys = []
self.input_key_nodes: dict[str, KeyNode] = {}
self.output_keys = []
self.output_keys_nodes: dict[str, KeyNode] = {}
self._get_keys_from_kwargs() # Extract keys from kwargs
def _get_keys_from_kwargs(self):
for k, v in self.kwargs.items():
if k.startswith("input_") and isinstance(v, str):
self.input_keys.append(v)
self.input_key_nodes[v] = KeyNode(k, v)
elif k.startswith("output_") and isinstance(v, str):
self.output_keys.append(v)
self.output_keys_nodes[v] = KeyNode(k, v)
else: # warning for unexpected keys with red color
print(f"\033[91mWarning: Unexpected key '{k}' in operator {self.op_obj.__class__.__name__}\033[0m")
def init_output_keys_nodes(self, keys:list[str]):
for key in keys:
self.output_keys.append(key)
self.output_keys_nodes[key] = KeyNode(key, key)
def init_input_keys_nodes(self, keys:list[str]):
for key in keys:
self.input_keys.append(key)
self.input_key_nodes[key] = KeyNode(key, key)
def __str__(self):
op_class = self.op_obj.__class__.__name__ if self.op_obj else "None"
input_keys_str = ', '.join(self.input_keys)
output_keys_str = ', '.join(self.output_keys)
return (
f"OperatorNode(\n"
f" Operator_class: {op_class},\n"
f" Operator_name: {self.op_name},\n"
f" Input Keys: [{input_keys_str}],\n"
f" Output Keys: [{output_keys_str}],\n"
f" Input Nodes: [{self.input_key_nodes}],\n"
f" Output Nodes: [{self.output_keys_nodes}],\n"
f" Additional Params: {self.kwargs}\n"
f")"
)
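# Illustrative sketch (hypothetical key names): keyword arguments whose names start
# with "input_" / "output_" are turned into KeyNode objects automatically by
# _get_keys_from_kwargs().
#
#     node = OperatorNode(
#         op_obj=None,
#         op_name="qa_generator",
#         storage=None,
#         llm_serving=None,
#         input_key="question",
#         output_key="answer",
#     )
#     # node.input_keys  -> ["question"]
#     # node.output_keys -> ["answer"]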
from .reasoning import *
import json
from dataflow.utils.registry import PROMPT_REGISTRY
from dataflow.core.prompt import PromptABC
"""
A collection of prompts for the AgenticRAG pipeline operators
"""
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorGetIdentifierPrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to get identifier.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
You need to extract the content_identifier from question. Here's how:
1. For each question, identify the main subject/noun phrase that the question is about
2. This should typically be:
- Proper nouns (names, titles)
- Specific technical terms
- Unique identifiers in the question
Examples:
{
"question": "What is the third movie in the Avatar series?",
"content_identifier": "Avatar series"
},
{
"question": "龙美术馆2025年展览展览时间范围是什么",
"content_identifier": "龙美术馆"
}
Return JSON format with key "content_identifier"
'''
return system_prompt
def build_prompt(self, input) -> str:
prompt = f'''
Now process this question:{input}
'''
return prompt
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorGetConlcusionPrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to get initial conclusion.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
# Conclusion Extraction and Relationship Generation Specifications
## I. Input/Output Requirements
**Input**: Any document fragment
**Output**: JSON array where each element contains `conclusion` and `R` fields
## II. Conclusion Extraction Rules
1. **Atomicity**
- Each conclusion must be an indivisible basic fact
- ✖ Prohibited combined conclusions: "A increased by 5% and B decreased by 2%" → Should be split into two conclusions
2. **Verifiability**
- Must contain at least one definite identifier:
✓ Numeric value (59.0%)
✓ Time (2025/04/28)
✓ Unique name (Humpback65B)
- ✖ Reject vague expressions: "Performance has improved"
3. **Timeliness Handling**
- Explicitly mark time ranges when containing time-sensitive information
- Examples:
✓ "Global GDP grew by 3.0% in 2023"
✖ "Recent GDP growth of 3.0%"
4. **Citation Integrity**
- If a conclusion cites other content (e.g., "as stated in (2)"), the complete content of (2) must be embedded in the conclusion
## III. Relationship (R) Generation Standards
### Attribute Requirements
- **Structured**: Use semicolons to separate multi-metrics (Example 3)
- **Operational**: Directly usable for database queries or calculations
✓ "City with the highest temperature"
✖ "Conclusions about temperature"
### Generation Templates
| Conclusion Type | R Template | Example |
|-------------------------|---------------------------------------|---------------------------------|
| Single Numeric Result | "[Indicator Name]" | A: "59.0%" → R: "Accuracy" |
| Comparative Conclusion | "[Indicator] compared to [baseline] in [change dimension]" | A: "4.2% higher than baseline" → R: "Improvement in accuracy compared to baseline" |
| Multi-dimensional Result| "[Primary Indicator] and its [sub-dimension] distribution" | A: "Average 59% (Humanities 65.6%)" → R: "Average accuracy and subject distribution" |
## IV. Output Specifications and Examples
[
{
"conclusion": "Humpback65B achieved a zero-shot accuracy of 59.0% in the MMLU evaluation",
"R": "Humpback65B's zero-shot accuracy"
},
{
"conclusion": "On 2025/04/28, the closing price of XL Er Nantes-U was $11.34 (up 14.0%)",
"R": "Closing price and percentage increase of XL Er Nantes-U on 2025/04/28"
},
{
"conclusion": "90% of 27 million metric tons",
"R": "Proportion of new global LNG supply from North America in 2025"
},
{
"conclusion": "Abstract",
"R": "Indexed part of Springer articles in databases"
},
{
"conclusion": "2024-03-06",
"R": "Publication date of Psychology Top 100 of 2023"
},
{
"conclusion": "2018-01",
"R": "Collection date of 'The Importance of Referencing - PMC'"
},
{
"conclusion": "30-40%",
"R": "Percentage of science report dedicated to results section"
},
{
"conclusion": "$500 billion",
"R": "Projected economic contribution of hybrid work models by 2030"
},
{
"conclusion": "650,000+",
"R": "Number of youth insights in India Skills Report 2025"
},
{
"conclusion": "July 2024 issue",
"R": "Consumer Reports publication in July 2024"
},
{
"conclusion": "16th annual, 2024-12",
"R": "Edition and publication date of Deloitte's Tech Trends 2025"
},
{
"conclusion": "January 2024 issue",
"R": "Consumer Reports publication in January 2024"
},
{
"conclusion": "December 2021 issue",
"R": "Consumer Reports publication in December 2021"
},
{
"conclusion": "November 2021 issue",
"R": "Consumer Reports publication in November 2021"
},
{
"conclusion": "14",
"R": "Death count in listeria outbreak linked to frozen shakes"
},
{
"conclusion": "$122 million",
"R": "Mega Millions jackpot amount for May 16"
},
{
"conclusion": "32",
"R": "Number of consecutive years United Way met its goal"
},
{
"conclusion": "62%",
"R": "Percentage increase in Chemical Sciences article submissions (2014-2021)"
},
{
"conclusion": "11 pounds of fish",
"R": "Fish trade for Europa League semifinal ticket"
},
{
"conclusion": "2-1",
"R": "PSG vs. Arsenal match result (Champions League)"
}
]
'''
return system_prompt
def build_prompt(self, input) -> str:
prompt = f'''
The document content to be processed is as follows: {input}
'''
return prompt
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorQuestionPrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to get initial question.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''Your task is to generate a corresponding question (Q) based on the given task identifier (ID), relationship (R), and answer (A).
Input/Output Specifications:
Input:
- ID: Data source or query scope
- R: Logical relationship for extracting the answer from the data
- A: Known correct answer
Output:
- Must be in strict JSON format: {"Q": "generated question"}
- No explanations or extra fields allowed
Q must satisfy:
1. Be a complete natural language question
2. Allow deriving answer A by applying R after accessing context via ID
Question Generation Principles:
1. Exact correspondence - Each question must be fully based on the original conclusion, with the answer being its core content.
2. Derivability - The original conclusion must be directly derivable from the question and be the only correct answer.
3. Self-containment - Questions must be complete and independent, not relying on external references or unspecified context.
4. Information hiding - Do not reveal specific sources or data paths, but can include search hints.
5. Specificity and clarity - Questions should include details like specific times to ensure unique answers.
6. Single question - Generate only one question per conclusion.
7. If the conclusion can only be obtained from input content, include hints via data source identifiers in the question.
8. Language consistency - The language of each question must be the same as the conclusion's language.
Examples:
Input:
ID: Global daily maximum temperatures
R: City with the highest temperature
A: xx City
Output: {"Q": "What is the city with the highest temperature in global daily maximum temperatures?"}
Only output JSON without additional content.
'''
return system_prompt
def build_prompt(self, identifier, conclusion, relation) -> str:
prompt = f'''
Data to be Processed:
ID: {identifier}
R: {relation}
A: {conclusion}
'''
return prompt
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorCleanQAPrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to clean QA.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''Processing Rules:
1. Extract ONLY the exact information requested in the question
2. Preserve the original index numbering
3. Never omit essential information
4. Standardize all numerical formats:
- Percentages: 8% (not "8percent" or "eight percent")
- Numbers: Use commas for thousands (3,045)
- Currency: $1,000 (not "1000 dollars")
- Dates: YYYY-MM-DD format
- Units: include (5kg, 10cm, etc.)
Example:
{
"question": "How many travel trends for 2022 does '2025 Annual Travel Trends Report' present?",
"original_answer": "The Neo4j graph database was used to organize 3,045 Raman spectra of exosomes.",
"refined_answer": "3,045"
}
Required JSON format:
{
"question": str,
"original_answer": str,
"refined_answer": str
}
Key requirements:
- Be extremely concise in refined_answer
- Never add information not present in original_answer
- Preserve all numerical values exactly
- If question asks for specific data, extract only that data
'''
return system_prompt
def build_prompt(self, input) -> str:
prompt = f'''
The data need to be processed is as follows: {input}
'''
return prompt
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorAnswerPrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to get LLM's answer.
'''
def __init__(self):
pass
def build_prompt(self, input) -> str:
prompt = f'''Please solve the following problem and return as many relevant results as possible that meet the query requirements.\n Ensure responses are as concise as possible, focusing only on key information while omitting redundant details.\n The task is:\n {input}
'''.strip()
return prompt
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorRecallScorePrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to get recall score.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
Evaluate the consistency of the core content of the golden answer and the other answer
# Scoring Criteria
1) 2 points: the information in the golden answer and the other answer is completely consistent, although the wording may differ.
2) 1 point: the other answer contains all the information of the golden answer but has additional valid information.
3) 0 points: the other answer lacks necessary key information from the golden answer, or the two answers contradict each other.
# Examples:
1) Examples for 2 points:
1.1) two answers are completely consistent:
- Golden answer: Interest rates should be raised and inflation should be monitored.
- Other answer: It is necessary to raise interest rates and monitor inflation.
2) Examples for 1 point:
2.1) the other answer contains all the information of the golden answer and adds extra useful information:
- Golden answer: The interest rates should be raised.
- Other answer: The interest rates should be raised and inflation should be monitored.
3) Examples for 0 point:
3.1) the other answer lacks the key information of the golden answer:
- Golden answer: The interest rates should be raised and inflation should be monitored.
- Other answer: The interest rates should be raised.
3.2) the other answer has contradictions:
- Golden answer: Interest rates should be raised by 50 basis points.
- Other answer: Interest rates should be raised by 25 basis points.
# the output should be in JSON format as required without any irrelevant content
{
"answer_analysis":"give out the reason on how to score the llm_answer",
"answer_score":0/1/2
}
'''
return system_prompt
def build_prompt(self, golden_answer, llm_answer) -> str:
prompt = f'''
The inputs are as follows:
Golden Answer: {golden_answer}
Other Answer: {llm_answer}
'''
return prompt
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorOptionalAnswerPrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to get optional answer.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = """
You are an expert in **linguistic variation** and **data augmentation**. Your task is to generate a comprehensive list of all plausible and commonly recognized alternative expressions, formats, and aliases for a given input entity or piece of information. The goal is to create high-quality training data that captures diverse ways of referring to the same concept.
**Key Guidelines:**
1. **Equivalence:** Each alternative expression must refer to *exactly the same entity or information* as the original input. Do not include broader categories, narrower sub-types, or related but distinct concepts.
2. **Scope of Variation:** Focus on:
Different **formatting conventions** (e.g., dates, numbers, units).
Common **abbreviations, acronyms, or initialisms**.
Well-known **aliases, nicknames, or shorter forms** in common usage.
Synonyms or rephrasing should *only* be included if they are direct, commonly accepted equivalents.
3. **Context-Agnosticism:** Unless the input itself implies a specific context, generate general-purpose variations. Avoid creating variations that are only valid in very niche or obscure contexts.
4. **Inclusion of Original:** Always include the original input as the first item in the generated list.
5. **Format:** Output the variations as a JSON list of strings.
**Examples:**
Input: 1977-01-26
Output: ["1977-01-26", "1977 01 26", "1977.01.26", "January 26, 1977", "26 Jan 1977", "Jan 26, 1977"]
Input: United Nations
Output: ["United Nations", "U.N.", "UN"]
Input: 3.14159
Output: ["3.14159", "π", "pi", "PI"]
Input: Doctor of Philosophy
Output: ["Doctor of Philosophy", "Ph.D.", "PhD", "Doctorate"]
Input: New York City
Output: ["New York City", "NYC", "The Big Apple"]
Input: kilogram
Output: ["kilogram", "kg", "kilograms"]
"""
return system_prompt
def build_prompt(self, answer) -> str:
prompt = f"""
The original answer is: {answer}
Please list all possible textual expressions that have the same meaning or refer to the same entity, especially in different formats (e.g., dates, names, abbreviations).
Respond with a JSON list of strings. Do not explain.
"""
return prompt
@PROMPT_REGISTRY.register()
class AtomicTaskGeneratorGoldenDocAnswerPrompt(PromptABC):
'''
The prompt for the AtomicTaskGenerator to get LLM's answer with golden doc.
'''
def __init__(self):
pass
def build_prompt(self, golden_doc, question) -> str:
prompt = f"""You are given the following document that contains relevant information to help answer a question.
Document:
\"\"\"
{golden_doc}
\"\"\"
Question:
{question}
Please answer the question using ONLY the information in the provided document. Return the final answer directly, with no explanation.
"""
return prompt
@PROMPT_REGISTRY.register()
class DepthQAGeneratorGetIdentifierPrompt(PromptABC):
'''
The prompt for the DepthQAGenerator to get the content identifier.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
You need to extract the content_identifier from question. Here's how:
1. For each question, identify the main subject/noun phrase that the question is about
2. This should typically be:
- Proper nouns (names, titles)
- Specific technical terms
- Unique identifiers in the question
Examples:
{
"question": "What is the third movie in the Avatar series?",
"content_identifier": "Avatar series"
},
{
"question": "龙美术馆2025年展览展览时间范围是什么",
"content_identifier": "龙美术馆"
}
Return JSON format with key "content_identifier"
'''
return system_prompt
def build_prompt(self, input) -> str:
prompt = f'''
Now process this question:{input}
'''
return prompt
@PROMPT_REGISTRY.register()
class DepthQAGeneratorBackwardTaskPrompt(PromptABC):
'''
The prompt for the DepthQAGenerator to get the backward task.
'''
def __init__(self):
pass
def build_prompt(self, input) -> str:
prompt = f'''
Conduct divergent searches based on the input element to find an appropriate superset related to its attributes, and elaborate on the relationship between the superset and the element (mine for special and uniquely pointing relationships to ensure that the superset + relationship does not mislead to other subsets). Example supersets include:
1. The superset of a paragraph or sentence can be the text content it belongs to.
2. The superset of a specific term can be its corresponding discipline or category.
3. The superset of a specific date can be any date range containing it, such as the week or month it belongs to.
4. The superset of a short event can be the complete specific event it belongs to.
5. The superset of a page can be other pages referencing it or its parent page.
6. Only generate one relationship, and the content of the relationship should preferably not include strongly specific proper nouns.
Optional expressions for relationships:
1. Clearly express hierarchical or ownership relationships. If the input is a sub-item of a series of works, the relation should indicate its position; if the input is a part of a superset, the relation should clarify its ownership.
2. Provide the specific positioning of the input content, such as time range, field of paper publication, or specific role in the superset.
3. Wording should conform to the research field or industry standards of the input content.
4. Only provide necessary association information to avoid irrelevant content. Good example: "This study is part of the IRAM NOEMA Large Program research collection". Bad example: "This study is a very important research conducted by many scientists and has produced very meaningful results" (verbose and containing subjective evaluations).
Note:
1. Please return the identifier of the superset content, such as attribute name, web page title, paper title, etc., which uniquely locates the superset content.
2. The content of the superset needs to be obtained through tool invocation, which can be specific web content, PDF text, or image understanding content.
3. Please clearly describe the relationship between the superset content and the input element, that is, list the qualification conditions from the superset content to ensure that the conditions uniquely point to the input element, and the description of the conditions should be concise.
4. Use a maximum of 3 search keywords per search; if more than 3 keywords are needed, perform multiple searches separately.
5. The obtained identifier should preferably be derived from search results and not include the input content.
6. If the input is a PDF document, give priority to invoking tools to read the document content.
Return format requirements: Please return the result in JSON format with keys 'identifier': str (identifier) and 'relation': str (relationship).
Here are some reference input-output examples:
Example1:
Input: Avatar 3: Fire and Ash
identifier: Avatar film series
relation: The third film
Example2:
Input: The 15 social media trends that will shape your 2025 strategy
identifier: Hootsuite blog end of 2024
relation: The authoritative trends report published by Hootsuite to guide social media strategy development
Example3:
Input: SOLIS (Seeds of Life In Space) project
identifier: NOEMA Large Program
relation: A sub-project within NOEMA's specific large observation program related to research on the existence of life in the universe.
Example4:
Input: SOLIS. XIX. The chemically rich SVS13-B protostellar jet
identifier: IRAM NOEMA Large Program research collection
relation: One of the imaged enriched molecular jet samples in the IRAM NOEMA Large Program research collection, specifically imaged and analyzed for molecular distribution and composition within the collection, uniquely locatable via observation data on SVS13-B in the collection.
Example5:
Input: AdCare -VLM: Leveraging Large Vision Language Model (LVLM) to Monitor Long-Term Medication Adherence and Care
identifier: A Survey of State of the Art Large Vision Language Models: Alignment, Benchmark, Evaluations and Challenges
relation: A paper that introduces advancements in large vision language models in A Survey of State of the Art Large Vision Language Models: Alignment, Benchmark, Evaluations and Challenges, covering models including the LVLM described in the input paper.
Example6:
Input: Immigration is a higher priority for Americans in 2025: AP-NORC poll | AP News
identifier: 2025 policy priorities report for AAPI communities
relation: The poll results about shifting immigration priorities featured in AP News and referenced in AAPI policy reports
Example7:
Input: X-ray Absorption Spectroscopy (XAS) database for iron-containing proteins (arXiv:2504.18554)
identifier: iron-binding proteins database
relation: The specialized database that collects XAS data specifically for proteins containing iron
Example8:
Input: live-action 'Snow White' movie controversy
identifier: Disney animated film adaptation
relation: The controversial live-action movie adapted from a Disney animated film featuring the main character Snow White
Example9:
Input: Evaluating the evidence: a systematic review of reviews of the effectiveness and safety of digital interventions for ADHD | BMC Psychiatry | Full Text
identifier: BMC Psychiatry journal 2025 publications
relation: The full-text systematic review about digital ADHD interventions published in BMC Psychiatry
Example10:
Input: Enron Corporation
identifier: 2001 Fortune Global 500 energy industry rankings
relation: The company that ranked first in revenue in the energy sector according to the 2001 Fortune Global 500 rankings
Current input:
{input}
'''
return prompt
@PROMPT_REGISTRY.register()
class DepthQAGeneratorSupersetCheckPrompt(PromptABC):
'''
The prompt for the DepthQAGenerator to check the superset.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
**Task**: Validate if a given "superset" can uniquely identify a "subset" based on the provided "relationship".
**Rules**:
1. **Superset-Subset Relationship**:
- The "superset" must be a true generalization of the "subset" (e.g., "Animal" is a valid superset of "Dog").
- The "superset" CANNOT be a synonym of the "subset" (e.g., "Car" and "Automobile" are invalid).
2. **Relationship Validity**:
- The relationship must **explicitly and uniquely** link the superset to the subset.
- It CANNOT be a **many-to-one mapping**.
**Output Format**:
Return a JSON with the key `new_query`. The value should be:
- `"valid"` if the superset and relationship can uniquely locate the subset.
- `"invalid"` otherwise.
**Example Valid Output**:
{"new_query": "valid"}
'''
return system_prompt
def build_prompt(self, new_id, relation, identifier) -> str:
prompt = f'''
Given superset: {new_id}\n
Given relationship: {relation}\n
Given subset: {identifier}\n
'''
return prompt
@PROMPT_REGISTRY.register()
class DepthQAGeneratorQuestionPrompt(PromptABC):
'''
The prompt for the DepthQAGenerator to get the question.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
Please generate a question based on the content of the input identifier, a certain answer, and a certain relationship (this relationship is the relationship between the content of the file corresponding to the identifier and the given answer), such that
The answer to this question is the input answer.
The content of this question is determined by the content of the identifier and the content of the given relationship.
The generated question should not involve the content of the input answer.
Please return it in JSON format, with the key of the JSON being new_query.
'''
return system_prompt
def build_prompt(self, new_id, relation, identifier) -> str:
prompt = f'''
Certain answer: {identifier}\n
Identifier: {new_id}\n
Relationship: {relation}\n
'''
return prompt
@PROMPT_REGISTRY.register()
class DepthQAGeneratorAnswerPrompt(PromptABC):
'''
The prompt for the DepthQAGenerator to get the LLM's answer.
'''
def __init__(self):
pass
def build_prompt(self, input) -> str:
prompt = f'''
Please solve the following problem and return as many relevant results as possible that meet the query requirements. Ensure responses are as concise as possible, focusing only on key information while omitting redundant details.
Please return the result in JSON format with keys 'answer_list': List[str], the list of answers.

The task is:
{input}
'''.strip()
return prompt
@PROMPT_REGISTRY.register()
class DepthQAGeneratorRecallScorePrompt(PromptABC):
'''
The prompt for the DepthQAGenerator to get the recall score.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
Evaluate the consistency of the core content of the golden answer and the other answer
# Scoring Criteria
1) 2 points: the information in the golden answer and the other answer is completely consistent, although the wording may differ.
2) 1 point: the other answer contains all the information of the golden answer but has additional valid information.
3) 0 points: the other answer lacks necessary key information from the golden answer, or the two answers contradict each other.
# Examples:
1) Examples for 2 points:
1.1) two answers are completely consistent:
- Golden answer: Interest rates should be raised and inflation should be monitored.
- Other answer: It is necessary to raise interest rates and monitor inflation.
2) Examples for 1 point:
2.1) the other answer contains all the information of the golden answer and adds extra useful information:
- Golden answer: The interest rates should be raised.
- Other answer: The interest rates should be raised and inflation should be monitored.
3) Examples for 0 point:
3.1) the other answer lacks the key information of the golden answer:
- Golden answer: The interest rates should be raised and inflation should be monitored.
- Other answer: The interest rates should be raised.
3.2) the other answer has contradictions:
- Golden answer: Interest rates should be raised by 50 basis points.
- Other answer: Interest rates should be raised by 25 basis points.
# the output should be in JSON format as required without any irrelevant content
{
"answer_analysis":"give out the reason on how to score the llm_answer",
"answer_score":0/1/2
}
'''
return system_prompt
def build_prompt(self, golden_answer, llm_answer) -> str:
prompt = f'''
The inputs are as follows:
Golden Answer: {golden_answer}
Other Answer: {llm_answer}
'''
return prompt
@PROMPT_REGISTRY.register()
class WidthQAGeneratorMergePrompt(PromptABC):
'''
The prompt for the WidthQAGenerator to merge questions.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
# Comprehensive Task Guide for Research Questions
## Core Objective:
Intelligently merge 2-3 related research questions into high-quality comprehensive questions while maintaining the integrity and accuracy of the original content.
## Input Requirements:
- Each question includes: index (unique ID), question (question text), golden_answer (standard answer), content_identifier (content identifier)
## Grouping Specifications:
### Grouping Strategies:
1. **Content Matching Principle**:
- Priority: Merge questions with similar themes
2. **Quantity Control**:
- Each group must contain 2-3 original questions
- Ensure all original questions are grouped (no omissions)
### Standards for Question Synthesis:
1. **Content Integrity**:
- Retain all elements of the original questions
- Do not add new facts or assumptions
- Completely preserve time-related elements in their original form
2. **Question Quality**:
- Clear and unambiguous expression
- Logically coherent merged questions
- Do not imply any solution methods
3. **Structural Requirements**:
- Form complete interrogative sentences (not simply connected with "and")
- Correct grammatical structure
- Preserve professional terminology in its original form
## Output Specifications:
[
{
"question": "Text of the synthesized question",
"index": [1,2,3], // Original indices
"content_identifier": "Original content identifier"
}
]
'''
return system_prompt
def build_prompt(self, input) -> str:
prompt = f'''
Here are the base questions to process:
{json.dumps(input, indent=2, ensure_ascii=False)}
Each dictionary contains: index (unique ID), question (original question), and content_identifier (identifier).
'''
return prompt
@PROMPT_REGISTRY.register()
class WidthQAGeneratorOriginCheckPrompt(PromptABC):
'''
The prompt for the WidthQAGenerator to check origin.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
Task Instructions:
Verify if complex questions can be properly decomposed into their original questions.
Return state=1 if all conditions are met, state=0 otherwise:
Conditions for state=1:
1. The complex question clearly contains all elements from original questions
2. No information distortion or ambiguity introduced
3. Logical relationships between original questions are properly maintained
For example:
"index": 1
"Complex Question": "In the Academia Insider article 'The best AI tools for research papers and academic research (Literature review, grants, PDFs and more)', how does Semantic Scholar enhance literature review efficiency? Who are the two contributors—one with a Master’s and Ph.D. in Chemistry from the UK and Australia, and the other a Ph.D. student at Simon Fraser University (SFU)—credited with contributing academic insights and initiating the list of AI research tools, respectively?"
"Original Questions": [
"According to 'The best AI tools for research papers and academic research (Literature review, grants, PDFs and more) - Academia Insider', how does Semantic Scholar enhance literature review efficiency?",
"In the Academia Insider article 'The best AI tools for research papers and academic research (Literature review, grants, PDFs and more)', who is the contributor with a Master’s and Ph.D. in Chemistry from the UK and Australia and extensive research experience?",
"In the Academia Insider article 'The best AI tools for research papers and academic research (Literature review, grants, PDFs and more)', who is the contributor credited with helping to start the list of AI research tools?"
]
The above complex question can be decomposed into these original questions without deviation in content, and the status is returned as 1.
"index": 2
"Complex Question": "Based on the trends reported in the 2025 scientific publications of the Academy of Articles and the information on open and free content from the JSTOR and Artstor 'About JSTOR' page, when does research on protecting cultural and linguistic diversity through AI reach its peak? What is the total number of research reports available, and how many policy institutes are represented in the collection?"
"Original Questions": [
"According to the 2025 scientific publication trends of the Academy of Articles, when does research on protecting cultural and linguistic diversity through AI reach its peak?",
"According to the information on open and free content from the JSTOR and Artstor 'About JSTOR' page, what is the total number of research reports in the collection? How many policy institutes are covered?"
]
The above complex question cannot be decomposed into original questions because the direction of the questions in the complex question is confusing and ambiguous, and the status is returned as 0.
Example Output:
[{
"index": 1,
"complex_question": "original complex question",
"state": 1
}]
'''
return system_prompt
def build_prompt(self, input) -> str:
prompt = f'''
Here are the base questions to process:
{json.dumps(input, indent=2, ensure_ascii=False)}
Each dictionary contains: index (unique ID), complex_question (original complex question),
and original_questions (list of original questions).
'''
return prompt
@PROMPT_REGISTRY.register()
class WidthQAGeneratorQuestionVerifyPrompt(PromptABC):
'''
The prompt for the WidthQAGenerator to verify question.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
Answer the provided complex research questions based on your knowledge.
For each question, provide your answer.
Output JSON format:
[{
"index": 1 // original question indices
"complex_question": original complex question,
"llm_answer"://your answer
},
{
"index": 2 // original question indices
"complex_question": original complex question,
"llm_answer"://your answer
}]
'''
return system_prompt
def build_prompt(self, input) -> str:
prompt = f'''
Please answer these research questions:
{json.dumps(input, indent=2, ensure_ascii=False)}
'''
return prompt
@PROMPT_REGISTRY.register()
class WidthQAGeneratorAnswerPrompt(PromptABC):
'''
The prompt for the WidthQAGenerator to get LLM's answer.
'''
def __init__(self):
pass
def build_prompt(self, input) -> str:
prompt = f'''
Please solve the following problem and return as many relevant results as possible that meet the query requirements. Ensure responses are as concise as possible, focusing only on key information while omitting redundant details.
Please return the result in JSON format with keys 'answer_list': List[str], the list of answers.

The task is:
{input}
'''.strip()
return prompt
@PROMPT_REGISTRY.register()
class WidthQAGeneratorRecallScorePrompt(PromptABC):
'''
The prompt for the WidthQAGenerator to get recall score.
'''
def __init__(self):
pass
def build_system_prompt(self) -> str:
system_prompt = '''
Evaluate the consistency of the core content of the golden answer and the other answer
# Scoring Criteria
1) 2 points: the information in the golden answer and the other answer is completely consistent, although the wording may differ.
2) 1 point: the other answer contains all the information of the golden answer but has additional valid information.
3) 0 points: the other answer lacks necessary key information from the golden answer, or the two answers contradict each other.
# Examples:
1) Examples for 2 points:
1.1) two answers are completely consistent:
- Golden answer: Interest rates should be raised and inflation should be monitored.
- Other answer: It is necessary to raise interest rates and monitor inflation.
2) Examples for 1 point:
2.1) the other answer contains all the information of the golden answer and adds extra useful information:
- Golden answer: The interest rates should be raised.
- Other answer: The interest rates should be raised and inflation should be monitored.
3) Examples for 0 point:
3.1) the other answer lacks the key information of the golden answer:
- Golden answer: The interest rates should be raised and inflation should be monitored.
- Other answer: The interest rates should be raised.
3.2) the other answer has contradictions:
- Golden answer: Interest rates should be raised by 50 basis points.
- Other answer: Interest rates should be raised by 25 basis points.
# the output should be in JSON format as required without any irrelevant content
{
"answer_analysis":"give out the reason on how to score the llm_answer",
"answer_score":0/1/2
}
'''
return system_prompt
def build_prompt(self, golden_answer, llm_answer) -> str:
prompt = f'''
The inputs are as follows:
Golden Answer: {golden_answer}
Other Answer: {llm_answer}
'''
return prompt
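# Usage sketch (hypothetical values; how the prompts are fed to a serving backend is an
# assumption, not defined in this file): each registered prompt pairs a system prompt
# with a per-sample prompt built from dataframe fields.
#
#     scorer = AtomicTaskGeneratorRecallScorePrompt()
#     system_prompt = scorer.build_system_prompt()
#     user_prompt = scorer.build_prompt(
#         golden_answer="Interest rates should be raised.",
#         llm_answer="It is necessary to raise interest rates.",
#     )
#     # Both strings are then passed to the configured LLM serving object by the operator.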
from dataflow.core.prompt import PromptABC
from dataflow.utils.registry import PROMPT_REGISTRY
@PROMPT_REGISTRY.register()
class ExtractSmilesFromTextPrompt(PromptABC):
def __init__(self, prompt_template = None):
if prompt_template is None:
self.prompt_template = """Extract the monomer/small molecule information from the text and format it as a structured JSON object.
Follow these rules strictly:
1. For each monomer/small molecule, extract:
- abbreviation: The commonly used abbreviated name
- full_name: The complete chemical name
- smiles: The SMILES notation of the molecular structure
2. General rules:
- Each monomer/small molecule should have a unique abbreviation
- If a monomer's information is incomplete, include only the available information
- Do not treat polymers (names containing "poly") as monomers
Example output:
[
{
"abbreviation": "4-ODA",
"full_name": "4,4′-Oxydianiline",
"smiles": "O(c1ccc(N)cc1)c2ccc(cc2)N"
},
{
"abbreviation": "6FDA",
"full_name": "4,4'-(hexafluoroisopropylidene)diphthalic anhydride",
"smiles": "C1=CC2=C(C=C1C(C3=CC4=C(C=C3)C(=O)OC4=O)(C(F)(F)F)C(F)(F)F)C(=O)OC2=O"
}
]
Please make sure to output pure JSON that can be saved to a JSON file; do not output HTML or other markup.
"""
else:
self.prompt_template = prompt_template
def build_prompt(self, target_monomers: str) -> str:
target_prompt = "\nHere are some monomers' abbreviations or full names; please only extract the information for these monomers. This rule has priority over the other rules. Here are the specific monomers: " + str(target_monomers)
return self.prompt_template + target_prompt
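# Usage sketch (hypothetical monomer names): the base extraction instructions are
# combined with a caller-supplied list of target monomers.
#
#     prompt = ExtractSmilesFromTextPrompt()
#     text = prompt.build_prompt(target_monomers="4-ODA, 6FDA")
#     # `text` is the extraction instruction restricted to the listed monomers.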
from dataflow.core.prompt import PromptABC
from dataflow.utils.registry import PROMPT_REGISTRY
'''
A collection of prompts for the code operators.
'''
@PROMPT_REGISTRY.register()
class CodeQualityEvaluatorPrompt(PromptABC):
'''
The prompt for the code quality evaluator.
'''
def __init__(self):
pass
def build_prompt(self, instruction: str, code: str) -> str:
"""
Generate system prompt for code quality evaluation.
"""
prompt = (
"You are a meticulous and critical code reviewer. Your task is to evaluate the quality of the "
"provided 'Generated Code' based on the given 'Instruction'.\n\n"
"Provide a single integer score from 1 (poor) to 10 (excellent) and brief, constructive feedback. "
"Your entire response MUST strictly follow the format below.\n\n"
"Instruction: {instruction}\n\n"
"Generated Code:\n"
"```python\n"
"{code}\n"
"```\n\n"
"Evaluation Criteria:\n"
"1. **Correctness & Completeness**: Does the code accurately and fully implement the instruction? Does it handle obvious edge cases? Are all necessary imports included (e.g., List, Dict, Optional from typing, other required modules)?\n"
"2. **Clarity & Best Practices**: Is the code clean, readable, and does it follow standard conventions (e.g., PEP 8 for Python)?\n"
"3. **Efficiency**: Is the implementation reasonably efficient for the given task?\n\n"
"Format your response EXACTLY as follows:\n"
"Score: [integer score from 1 to 10]\n"
"Feedback: [your feedback here]"
)
return prompt.format(instruction=instruction, code=code)
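# Note (illustrative, not part of the original module): downstream code is expected
# to parse the evaluator's reply in the "Score: <int>\nFeedback: <text>" format
# required above, e.g. with re.search(r"Score:\s*(\d+)", reply); that parsing step
# is an assumption about the caller, not something this prompt class performs.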
@PROMPT_REGISTRY.register()
class CodeCodeToInstructionGeneratorPrompt(PromptABC):
'''
The prompt for the code to instruction generator.
'''
def __init__(self):
pass
def build_prompt(self, code: str) -> str:
"""
Generate system prompt for code to instruction generation.
"""
prompt = (
"You are an expert programmer and a clear communicator. Your task is to analyze the "
"provided code snippet and generate a single, concise, and natural human instruction "
"that could have produced this code.\n\n"
"The instruction should be a directive, like 'Write a function that...' or 'Create a class to...'. "
"Do NOT add any explanations, comments, or markdown formatting. Output only the instruction text.\n\n"
"Code Snippet:\n"
"```\n"
"{code}\n"
"```\n\n"
"Generated Instruction:"
)
return prompt.format(code=code)
@PROMPT_REGISTRY.register()
class CodeInstructionGeneratePrompt(PromptABC):
'''
The prompt for generating new instructions based on few-shot examples.
'''
def __init__(self):
pass
def build_prompt(self, few_shot_examples) -> str:
"""
Generate prompt for creating new instructions similar to the few-shot examples.
"""
examples_text = ""
for i, example in enumerate(few_shot_examples, 1):
examples_text += f"Example {i}:\n{example['instruction']}\n\n"
prompt = (
"You are tasked with generating a NEW programming instruction similar in difficulty and style to the provided examples.\n\n"
"Output MUST follow EXACTLY this format (no extra text before/after):\n"
"Please provide a self-contained Python script that solves the following problem in a markdown code block\n"
"```\\n"
"[optional imports if needed]\\n"
"\\n"
"\\n"
"def function_name(...)-> ReturnType:\\n"
" \"\"\" Problem description derived from the original instruction.\\n"
" Include input/output description and constraints if any.\\n"
" Provide at least one doctest example:\\n"
" >>> function_name(example_input)\\n"
" expected_output\\n"
" \"\"\"\\n"
"```\\n"
"GIVEN EXAMPLES:\n"
f"{examples_text}"
"REQUIREMENTS:\n"
"1. Generate ONE new instruction that is similar in difficulty and complexity to the examples above\n"
"2. Make it diverse - do not simply copy or slightly modify the examples\n"
"3. The instruction should be clear, specific, and solvable\n"
"4. Maintain similar level of detail and specificity as the examples\n"
"NEW INSTRUCTION:"
)
return prompt
@PROMPT_REGISTRY.register()
class CodeInstructionEnhancement(PromptABC):
'''
The prompt for instruction standardization and enhancement.
Converts original instructions into a standardized format with proper Python function templates.
'''
def __init__(self):
pass
def build_prompt(self, instruction: str) -> str:
"""
Generate system prompt for instruction normalization.
Only require the output instruction to be about a Python function.
"""
prompt = (
"Rewrite the ORIGINAL INSTRUCTION into a standardized English instruction + code block.\n"
"Output MUST follow EXACTLY this format (no extra text before/after):\n"
"Please provide a self-contained Python script that solves the following problem in a markdown code block\n"
"```\\n"
"[optional imports if needed]\\n"
"\\n"
"\\n"
"def function_name(...)-> ReturnType:\\n"
" \"\"\" Problem description derived from the original instruction.\\n"
" Include input/output description and constraints if any.\\n"
" Provide at least one doctest example:\\n"
" >>> function_name(example_input)\\n"
" expected_output\\n"
" \"\"\"\\n"
"```\\n"
"REQUIREMENTS:\n"
"1. The first line (sentence) must be exactly: Please provide a self-contained Python script that solves the following problem in a markdown code block\n"
"2. The code fence uses raw ``` (no language tag). Nothing outside the fence except the first sentence.\n"
"3. Inside the fence: optionally add needed imports (omit if unnecessary), then TWO blank lines, then ONE function.\n"
"4. Infer a concise snake_case function name from the original instruction.\n"
"5. Provide full type annotations for parameters and return value (use reasonable types; if uncertain use str / int / List[str] etc.).\n"
"6. The function body MUST contain ONLY the docstring (no pass, no implementation, no other statements).\n"
"7. Docstring must be English, multi-line, and include: problem description, input/output description, constraints (if any), and at least one doctest derived or plausibly inferred.\n"
"8. Do NOT add additional functions, classes, comments, blank sections, placeholders (no TODO, no ...).\n"
"9. Do NOT echo the original instruction verbatim if it contains formatting artifacts—clean it while preserving meaning.\n"
"10. Absolutely no extra explanatory text outside the specified output format.\n"
"ORIGINAL INSTRUCTION:\n{instruction}\n"
"Produce ONLY the final standardized instruction + code block per the rules."
)
return prompt.format(instruction=instruction)
@PROMPT_REGISTRY.register()
class CodeInstructionToCodeGeneratorPrompt(PromptABC):
'''
The prompt for the instruction to code generator.
'''
def __init__(self):
pass
def build_prompt(self, instruction: str) -> str:
"""
Generate system prompt for instruction to code generation.
"""
prompt = (
"You are a world-class coding assistant. Your task is to fulfill the following request precisely. "
"Your response must contain ONLY the code that satisfies the instruction. "
"Do not add any explanations, introductory sentences, or markdown formatting like ```python ... ```.\n\n"
"Request: {instruction}\n\n"
"Generated Code:"
)
return prompt.format(instruction=instruction)
@PROMPT_REGISTRY.register()
class DiyCodePrompt(PromptABC):
'''
The prompt for custom code operations.
'''
def __init__(self, prompt_template: str):
self.prompt_template = prompt_template
def build_prompt(self, **kwargs) -> str:
"""
Generate prompt using custom template.
"""
try:
return self.prompt_template.format(**kwargs)
except Exception as e:
# If formatting fails, return the original template
return self.prompt_template
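
# Illustrative usage sketch (not part of the original module): the template and
# field values below are made up to show how DiyCodePrompt fills a custom template.
if __name__ == "__main__":
    diy = DiyCodePrompt("Review the following {language} snippet:\n{code}")
    print(diy.build_prompt(language="python", code="print('hello world')"))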
from dataflow.core.prompt import PromptABC
class StrFormatPrompt(PromptABC):
"""
Only `f_str_template` needs to be provided.
- `build_prompt(need_fields, **kwargs)` renders the template by replacing the listed fields with values from kwargs.
- `on_missing`: 'raise' | 'empty', controls the behavior when a required field is missing.
"""
def __init__(self, f_str_template: str = "{input_text}", on_missing: str = "raise"):
self.f_str_template = f_str_template
if on_missing not in ("raise", "empty"):
raise ValueError("on_missing must be 'raise' or 'empty'")
self.on_missing = on_missing
def build_prompt(self, need_fields, **kwargs):
# Validate that all required fields are present
missing = [f for f in need_fields if f not in kwargs]
if missing:
if self.on_missing == "raise":
raise KeyError(f"Missing fields for prompt: {missing}")
# Lenient mode: fill missing fields with empty strings
for f in missing:
kwargs[f] = ""
prompt = self.f_str_template
for key, value in kwargs.items():
prompt = prompt.replace(f"{{{key}}}", str(value))
return prompt
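
# Illustrative usage sketch (not part of the original module): the template and
# field values below are made up to show both the strict and lenient modes.
if __name__ == "__main__":
    fmt = StrFormatPrompt("Summarize the following text:\n{input_text}", on_missing="empty")
    print(fmt.build_prompt(need_fields=["input_text"], input_text="DataFlow prompt collection"))
    print(fmt.build_prompt(need_fields=["input_text"]))  # missing field is filled with ""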
'''
Prompts for the Bottom-Up Then Top-dOwN (BUTTON) multi-turn dialogue generation pipeline.
'''
from dataflow.utils.registry import PROMPT_REGISTRY
from dataflow.core.prompt import PromptABC
@PROMPT_REGISTRY.register()
class ExtractScenarioPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, conversation):
prompt = """
Please analyze the conversation below between a user and an
assistant bot and identify the general life scenario it
represents. Provide a concise overview of the scenario type,
such as 'booking flights' or 'ordering meals'. Avoid
mentioning specific details like numbers or items. Your
response should be a description of the scenario without
additional commentary, and should not exceed 10 words.
Conversation:
{conversation}
Concise Overview of the Scenario:
"""
return prompt.format(conversation=conversation)
@PROMPT_REGISTRY.register()
class ExpandScenarioPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, scenario):
prompt = """
Based on the provided daily scenario, creatively generate a new
and entirely different scenario. The new scenario must meet
the following requirements:
1. You may alter the action or subject of the original scenario.
2. The new scenario should differ substantially from the
original.
3. Ensure the new scenario is realistic and feasible within a
daily life context.
4. Retain the same format as the original scenario.
5. Limit your response to 10 words and present the new scenario
in a single sentence.
Original Scenario:
{scenario}
Modified Scenario:
"""
return prompt.format(scenario=scenario)
@PROMPT_REGISTRY.register()
class FuncAtomicTaskGeneratePrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, scenario):
prompt = """
You are training a model that can take a user's task description
or query, and available functions as input, and generate a
sequence of function calls to accomplish the task. Currently,
you are generating basic atomic tasks. Given a general life
scenario as the context, please generate a basic atomic task
that can be accomplished in one step.
Requirements of the task:
1. The task should be a reasonable real life task based on the
given scenario, and can be accomplished in one step.
2. If you mention some information, criteria or constraints in
the task, please give the details of these information,
criteria or constraints. Do not assume the model has access
to your personal information or prior knowledge, and it does
not have chance to ask you for clarification.
3. Please give enough details and make the task description as
specific as possible, so the model can make deterministic
function calls with deterministic arguments. Do not include
any ambiguous or vague information.
4. Do not mention specific tools or functions in the task
description, and do not propose solutions, hints, or project
outcomes.
5. Limit the task description to 30 words, and avoid using
adjectives and ambiguous words.
Given Scenario:
{scenario}
Please give your response in one line directly, without any
extra notation or format:
"""
return prompt.format(scenario=scenario)
@PROMPT_REGISTRY.register()
class SequentialTaskGeneratePrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, task):
prompt = """
You are training a model that can take a user's task description
or query, and available functions as input, and generate a
sequence of function calls to accomplish the task. Currently,
you are generating complex tasks for model training. Given a
task, you need to add a subsequent task for this given task
to make a more complex task.
The requirements for the subsequent task are as follows:
1. The subsequent task should use the output of the given task
as input.
2. The subsequent task can only be conducted after the given task has
been completed.
3. The subsequent task and the given task can form a new
composition task, and composing them can make a more
complex multi-step task.
## Examples:
### Given Task: Give me a list of all the pets.
### Subsequent Task: What is the most common kind of pet in the
list?
### Composition Task: Check the most common kind of pet in the
list of all the pets.
### Given Task: Who is author of the book "The Great Gatsby"?
### Subsequent Task: When was the author of this book born?
### Composition Task: When was the author of the book "The Great
Gatsby" born.
### Given Task: Give me the flight schedule from London to
Edinburgh today.
### Subsequent Task: Which flight has the shortest duration?
### Composition Task: Give me the flight from London to
Edinburgh with the shortest duration according to the flight
schedule today.
### Given Task: Retrieve the headlines of the news today from
BBC.
### Subsequent Task: What is the sentiment of the news
respectively?
### Composition Task: What is the sentiment of each headline in
today's news from BBC?
### Given Task: Which team won the World Cup in 2018?
### Subsequent Task: What is the team's captain?
### Composition Task: Who is the captain of the team that won
the World Cup in 2018.
## Here is the given task, please give your response following
the above format:
### Given Task: {task}
"""
return prompt.format(task=task)
@PROMPT_REGISTRY.register()
class ParathenSeqTaskGeneratePrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, task):
prompt = """
You are training a model that can take a user's task description
or query, and available functions as input, and generate a
sequence of function calls to accomplish the task. Currently,
you are generating complex tasks for model training. Given a
task, you need to add a parallel task and a subsequent task
for this given task to make a more complex task.
The requirements for the parallel task are as follows:
1. The parallel task should be related to the given task, and
the input should be independent of the output of the given task.
2. The parallel task can be conducted at the same time as the
given task, and they can be independent of each other.
3. The output of the given task and the parallel task can be
used together to conduct a subsequent task.
The requirements for the subsequent task are as follows:
1. The subsequent task should use the outputs of the given task
and the parallel task as input.
2. The subsequent task can only be conducted after the given task and
the parallel task have been completed.
3. The subsequent task, the given task and the parallel task can
form a new composition task, and composing them can make a
more complex multi-step task.
## Examples:
### Given Task: Give me a list of all the pets.
### Parallel Task: Find available pet food currently in the
store.
### Subsequent Task: Check if the pet food is suitable for the
pets in the list.
### Composition Task: Check if the pet food is suitable for the
pets in the list of all the pets.
### Given Task: When was the author of the book "The Great
Gatsby" born.
### Parallel Task: Find the publication date of the book "The
Great Gatsby".
### Subsequent Task: When the book was published, how long had
it been since the author was born?
### Composition Task: How old was the author of the book "The
Great Gatsby" when the book was published?
### Given Task: Give me the flight schedule from London to
Edinburgh today.
### Parallel Task: Find the every hour weather forecast in
Edinburgh today.
### Subsequent Task: What is the weather condition when the
first flight arrives?
### Composition Task: I am in London, and I want to know the
weather condition when the first flight arrives in Edinburgh
today.
### Given Task: What is the sentiment of each headline in
today's news from BBC?
### Parallel Task: Find the sentiment of each headline in
today's news from CNN.
### Subsequent Task: Which news source has more positive news
today?
### Composition Task: Compare the sentiment of each headline in
today's news from BBC and CNN, and check which news source
has more positive news.
### Given Task: Who is the captain of the team that won the
World Cup in 2018?
### Parallel Task: Who is the coach of the team that won the
World Cup in 2018?
### Subsequent Task: Are the captain and the coach from the same
country?
### Composition Task: Check if the captain and the coach of the
team that won the World Cup in 2018 are from the same country.
## Here is the given task, please give your response following
the above format:
### Given Task: {task}
"""
return prompt.format(task=task)
@PROMPT_REGISTRY.register()
class CompositionTaskFilterPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, task, sub_tasks):
prompt = """
You are an expert in task decomposition. Currently, you are
given a composition task and its potential task breakdown.
Please check if the sub-tasks can be used to complete the
composition task.
Composition task:
{task}
Potential task breakdown:
{sub_tasks}
Please check if the sub-tasks can be used to complete the
composition task. You should first give your analysis and
thinking, and finally give your conclusion (yes or no)
enclosed in <ans>, for example, <ans>yes</ans> or <ans>no</ans>:
"""
return prompt.format(task=task, sub_tasks=sub_tasks)
@PROMPT_REGISTRY.register()
class FuncGeneratePrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, task, sub_tasks):
prompt = """
You are training a model that can take a user's task description
or query, and available functions as input, and generate a
sequence of function calls to accomplish the task. Currently,
you are generating the training data for this model.
Given a composition task and its task breakdown, please
generate corresponding available functions that can be used
to accomplish each sub-task, and finally the composition
task can be accomplished by calling these functions
sequentially.
## Requirements for the functions:
1. The functions must possess a succinct, comprehensible name
and description.
2. The functions should not be tailored to the current task; they
are to be used for other future tasks as well, hence the design
of the APIs should be sufficiently generalized.
3. Avoid the recurrence of the task or its components in the
function description and name, offering a generic perspective
that can be employed across different contexts.
4. Make every function sufficiently granular and independent,
avoiding the conflation of multiple tasks within a single
function and avert creating monolithic APIs.
5. Consistency in terms of parameters and returns from each
function is critical. For instance, if two functions are
called sequentially, the output of the first should either
align with or constitute a part of the input for the second
function, irrespective of varying parameter terminologies.
## Requirements for the number of functions:
1. One sub-task may need zero, one or multiple functions to
complete it.
2. If a sub-task is about logic, comparison, set operations or
calculation, which can be solved by large language models,
then no function is needed for this sub-task, just leave the
func_list of this sub-task empty.
## Composition task:
{task}
## Task breakdown:
{sub_tasks}
## Response format:
'''json
[
{{
"sub_task": "a sub task from the task breakdown",
"func_list": [
{{
"name": "<function name>",
"description": "<function usage description>",
"parameters": {{
"<param1>": {{
"type": "<can be string, number, boolean,
object, array, enum and anyOf>",
"description": "<param1 description>",
... <more keys if needed>
}},
... <more parameters if needed>
}},
"required": "<array of required parameters, maybe
not all parameters above are required>"
"responses": {{
"<res1>" {{
"type": "<value1 type>",
"description": "<value1 description>"
}},
"<res2>": {{
"type": "<value2 type>",
"description": "<value2 description>"
}}
}}
}},
{{
... <more functions if needed>
}}
]
}}
... <more sub tasks and corresponding functions if needed>
]
'''
## Please respond following the format above:
"""
return prompt.format(task=task, sub_tasks=sub_tasks)
@PROMPT_REGISTRY.register()
class ConversationUserPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, task):
prompt = """
Assume you are playing the role of a user engaging with an AI assistant in a multi-turn task-solving scenario.
Currently, your goal is to complete a predefined task, and you
are seeking the AI assistant's help for this purpose.
**Task**
{task}
During this conversation, you should take on an active role and
explore the AI assistant's capability to solve problems \
within the **Task** using a series of function (tool) calls. You
should adhere to the following guidelines:
1. Your task is a complex one requiring multiple steps to
complete. In your initial question to the AI assistant, you
should provide a detailed explanation of the task, including
necessary information (such as potential data) that might be
needed to solve the problem. However, you should withhold
specific solution steps (e.g., avoid sequential terms like
"firstly," "secondly") and not dictate which functions (tools)
the AI should use - that is for the AI to determine.
2. Remember, during this multi-turn dialogue, you are portraying
the role of a human user. Your questions and responses
should reflect this human aspect. All your outputs should
be enclosed within the "<human>" tag, for example,
"<human> ... </human>".
"""
return prompt.format(task=task)
@PROMPT_REGISTRY.register()
class ConversationAssistantPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, sub_task, sub_task_func):
prompt = """
You are simulating the role of an expert in using functions
(i.e., tools) to solve users' tasks. You already possess
knowledge on how to decompose the task into subtasks and
understand which tools to use for their resolution.
**Subtasks**
{sub_task}
**Available Functions for Subtasks**
{sub_task_func}
Please use the tools provided above to answer the question posed
by "<human>". You must try as much as possible to use these
tools, instead of directly answering the question using your
prior knowledge.
Your response must obey the following format:
Observation: Carefully observe the user "<human>"'s question as
well as the output of the function call (often enclosed
within the "<func_return>" tag). Be sure to check for any
errors in previous outputs, as they may not always be
accurate. Enclose your observation within the "<observation>"
tag.
Thought: After observing and combining the previously listed
steps, give detailed and clear thoughts, reasonings, or
reflections, and according to the plan decide the next step.
Function Call: Name and arguments of the function call. The
function name must be the same as its name in the above
function list, and the arguments must obey the format required
by the function. Enclose the function call within the
"<func_call>" tag. If possible, you can call multiple functions
in parallel; be sure the functions called in parallel are
independent of each other.
Final Answer: When you believe the task is complete, you may
use 'final_answer' to provide a detailed summary of the
results to give to the user, enclose the final answer within
the tag "<final>".
Example 1 (regular function call):
<observation> User has provided two numbers - 15 and 25.
</observation>
<thought> Based on the user's request, we need to find the greatest
common divisor of these two numbers. We can use the function
'find_greatest_common_divisor' to solve this problem.
</thought>
<func_call>[
{{
"name": "find_greatest_common_divisor",
"arguments": {{"num1": 15, "num2": 25}}
}}
]</func_call>
Example 2 (parallel function call):
<observation> User wants to know the weather in two cities - New
York and London. </observation>
<thought> We can use the function 'get_weather' to find the
weather in New York and London. And the call to this function
can be done in parallel. </thought>
<func_call>[
{{
"name": "get_weather",
"arguments": {{"city": "New York"}}
}},
{{
"name": "get_weather",
"arguments": {{"city": "London"}}
}}
]</func_call>
Furthermore, when the user "<human>" raises a question, you need
to provide a structured plan to solve the question
('structured' means that the plan needs to include steps in
sequential order, such as Step 1, 2, 3, etc., or logic
processes that include loops and decision branches). The
contents of the plan can be placed in the first round
response's <thought>, and try as much as possible to follow
this plan in every subsequent function call. However, as
necessary, you may also modify the relevant plans according
to the result of the function call.
"""
return prompt.format(sub_task=sub_task, sub_task_func=sub_task_func)
@PROMPT_REGISTRY.register()
class ConversationToolPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, function):
prompt = """
You are simulating a computer system with powerful computational
capabilities and a complete setup. You possess ample
external prior knowledge, allowing you to run any arbitrary
function and execute calls to produce results, and you never
make errors. Given the following function, you should simulate
the operation of a computer system program as closely as
possible.
**Function**
{function}
Given a function call, you should execute the function and
provide the results in JSON format. Your response should not
contain irrelevant information, and must be enclosed within
the "<func_return>" tag.
### Example of function return:
<func_call>
{{
"name": "get_weather",
"arguments": {{"city": "New York"}}
}}
<func_return>
{{
"temperature": "25C",
}}
</func_return>
"""
return prompt.format(function=function)
@PROMPT_REGISTRY.register()
class ConversationEvalPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, conversation):
prompt = """
You are a strict evaluator of function-calling dialogues.
You will be given only the conversation content (a list of messages with role and content).
This may include tags like <observation>, <thought>, <func_call>, <func_return>, <final>.
Your task: Assign a quality score 1 to 5 based solely on the correctness and logical flow of tool usage.
Scoring rules:
- 5 = Excellent: correct tool usage, logical sequence, arguments reasonable.
- 4 = Good: mostly correct, only minor flaws.
- 3 = Fair: noticeable problems but still somewhat usable.
- 2 = Poor: major issues, unreliable.
- 1 = Very Bad: incoherent or irrelevant tool use.
You must output strictly in JSON format only, with no additional text.
Output format:
{{
"score": <int from 1 to 5>,
"explanation": "<short reasoning>"
}}
Now evaluate this conversation:
<conversation>
{conversation}
</conversation>
"""
return prompt.format(conversation=conversation)
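
# Illustrative usage sketch (not part of the original module): the conversation
# below is a made-up snippet; the evaluator is expected to reply with the
# {"score": ..., "explanation": ...} JSON schema required above.
if __name__ == "__main__":
    demo_conversation = (
        "<human> Find the greatest common divisor of 15 and 25. </human>\n"
        "<func_call>[{\"name\": \"find_greatest_common_divisor\", \"arguments\": {\"num1\": 15, \"num2\": 25}}]</func_call>\n"
        "<func_return>{\"result\": 5}</func_return>\n"
        "<final> The greatest common divisor of 15 and 25 is 5. </final>"
    )
    print(ConversationEvalPrompt().build_prompt(conversation=demo_conversation))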
import random
from dataflow.utils.registry import PROMPT_REGISTRY
from dataflow.core.prompt import PromptABC
'''
A collection of prompts for the general text operator.
'''
@PROMPT_REGISTRY.register()
class Phi4QAGeneratorPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, content: str) -> str:
"""
Generate the LLM input prompt by inserting the raw content into the prompt template.
"""
prompt = """
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the questions.
Convert the following paragraph into a conversational format with multiple tags of "Question:" followed by "Answer:":
You must only output in the given format:
Question: xxx Answer: xxx
Question: xxx Answer: xxx
Now please convert the content below.
{content}
"""
return prompt.format(content=content)
@PROMPT_REGISTRY.register()
class SFTGeneratorSeedPrompt(PromptABC):
def __init__(self, custom_prompt):
self.custom_prompt = custom_prompt
def build_prompt(self, content: str = "") -> str:
"""
Generate the LLM input prompt by inserting the raw content into the prompt template,
with optional custom instructions to enhance flexibility.
"""
base_prompt = """You are tasked with creating high-quality SFT data for LLM training.
Please generate one question based on the provided context, focusing on diversity, relevance, and clarity.
Requirements:
1. Generate exactly one distinct and well-formed question.
2. The question must be based on the context and include enough background for clarity.
3. Output must follow this JSON format:
{{
"instruction": "QUESTION",
"output": "ANSWER"
}}
Examples:
{{
"instruction": "Can you provide a list of healthy habits to maintain a healthy lifestyle? Please format your response as an HTML page with bullet points.",
"output": "Here's an HTML page with bullet points for healthy habits: <html><body><h3>Healthy Habits:</h3><ul><li>Eating a balanced diet...</li></ul></body></html>"
}},
{{
"instruction": "How can we use Python to calculate the GCD (greatest common divisor) of five numbers and express each number in terms of the GCD?",
"output": "Here's a Python function that calculates the GCD of five numbers: def find_gcd(...) ..."
}}
{custom_section}
Now, based on the following context, please generate one question:
"""
custom_section = f"Additional instruction:\n{self.custom_prompt}\n" if self.custom_prompt else ""
full_prompt = base_prompt.format(custom_section=custom_section)
return f"<|im_start|>system\n{full_prompt}<|im_end|>\n<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant"
import textwrap
@PROMPT_REGISTRY.register()
class MetaPrompt(PromptABC):
def __init__(self, dimensions):
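# Expected structure of `dimensions` (illustrative; the field names below are the
# ones consumed by _format_dimensions):
# [
#   {"dimension_name": "...", "description": "...",
#    "example_list": [{"score": 5, "text": "..."}, ...]},
#   ... (six dimensions in total, per the system prompt above)
# ]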
self.dimensions = self._format_dimensions(dimensions=dimensions)
self.system_prompt_template = textwrap.dedent("""\
You are an expert evaluator of text content. You will be given a single piece of text and must evaluate it across six specific dimensions listed below. Each dimension includes a description and a list of concrete examples (example_list), each labeled with a quality score. Higher scores indicate better quality. Use these examples to guide your assessment.
{dimensions_list}
Instructions:
- Provide a clear evaluation for each of the six dimensions based on the input text.
- Each evaluation should be one short paragraph.
- Then assign an integer score from 1 to 5 for each dimension, where:
5 = Excellent
4 = Good
3 = Fair
2 = Poor
1 = Very Poor
- Your output should end with a **separate final line** that contains a Python-style list of six integers in this format:
[5, 4, 3, 5, 4, 5]
""")
self.user_prompt_template = textwrap.dedent("""\
Please analyze and evaluate the following text:
Text:
{text}
Your output should include:
- One paragraph of analysis for each of the six quality dimensions listed above.
- A final line with your scores in this exact format:
[score1, score2, score3, score4, score5, score6]
""")
def _format_dimensions(self, dimensions):
formatted_list = []
for i, item in enumerate(dimensions, 1):
examples_str = "\n".join([
f'Example (Score: {ex["score"]}):\n"{ex["text"]}"\n'
for ex in item["example_list"]
])
block = f"""\"\"\"{i}. {item["dimension_name"]}: {item["description"]}
{examples_str}\"\"\""""
formatted_list.append(block)
return formatted_list
def build_system_prompt(self):
dimensions_text = "\n".join(self.dimensions)
return self.system_prompt_template.format(dimensions_list=dimensions_text)
def build_prompt(self, text):
return self.user_prompt_template.format(text=text)
@PROMPT_REGISTRY.register()
class AlpagasusPrompt(PromptABC):
def __init__(self, dimension='quality'):
self.dimension = dimension
self.system_prompt_template = """
We would like to request your feedback on the performance of the AI assistant in response to the instruction and the given input displayed below.
Instruction: {instruction}
Input: {input}
Response: {response}
"""
self.user_prompt_template = """
Please rate according to the {dimension} of the response to the instruction and the input. Each assistant
receives a score on a scale of 0 to 5, where a higher score indicates a higher level of the {dimension}. Please
first output a single line containing only the score value. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias.
"""
def build_system_prompt(self, instruction, input_text, response):
"""
Generate the system prompt.
"""
return self.system_prompt_template.format(instruction=instruction, input=input_text, response=response)
def build_prompt(self):
"""
Generate the user prompt.
"""
return self.user_prompt_template.format(dimension=self.dimension)
@PROMPT_REGISTRY.register()
class TreeinstructPrompt(PromptABC):
def __init__(self):
self.system_prompt_template = """
You are an instruction rewriter. You need to parse a given user instruction into a TREE structure following Semantic Parsing in the natural language processing field.
Procedure:
step-1: Parse the old “instruction” to a TREE-1 through Semantic Parsing in the natural language processing field.
Count and return the number of nodes in TREE-1.
Old instruction: “{instruction}”
"""
self.user_prompt_template = """
Please count and return the number of nodes in TREE-1. This number represents the complexity of the original instruction.
Output the number on the LAST line by itself. You must ensure the last line contains only the number of nodes in the tree, without any other symbols such as ```.
For example:
4
"""
def build_system_prompt(self, instruction):
"""
Generate the system prompt from the given instruction.
"""
return self.system_prompt_template.format(instruction=instruction)
def build_prompt(self):
"""
Generate the user prompt.
"""
return self.user_prompt_template
@PROMPT_REGISTRY.register()
class ConsistentQueryPrompt(PromptABC):
def __init__(self):
self.intent_categories = {
"Problem Solving Interaction": [
"From Problem Diagnosis to Solution Optimization"
],
"Educational Interaction": [
"From Broad Theory to Specific Scenarios",
"From Basic Concepts to Cross-Domain Connections"
],
"Health Consultation Interaction": [
"From Problem Diagnosis to Solution Optimization",
"From Hypothesis Testing to Substantive Discussion"
],
"Exploratory Interaction": [
"From Time Sequence Expansion to Explore Causes and Effects",
"From Hypothesis Testing to Substantive Discussion"
],
"Entertainment Interaction": [
"From Single Perspective to Multiple Perspectives",
"From Hypothesis Testing to Substantive Discussion"
],
"Simulation Interaction": [
"From User Needs to Solutions",
"From Broad Theory to Specific Scenarios"
],
"Emotional Support Interaction": [
"From Single Perspective to Multiple Perspectives",
"From User Needs to Solutions"
],
"Information Retrieval Interaction": [
"From Basic Concepts to Cross-Domain Connections",
"From Time Sequence Expansion to Explore Causes and Effects"
],
"Transaction Interaction": [
"From User Needs to Solutions",
"From Problem Diagnosis to Solution Optimization"
]
}
self.topic_dict = {
"Problem Solving Interaction": [
"Technical support for computer hardware issues",
"Home repair advice for plumbing problems",
"Planning a budget-friendly vacation",
"Fixing issues with internet connectivity",
"Setting up a smart home system",
"Solving problems with a broken washing machine",
"Troubleshooting a malfunctioning printer",
"How to repair a car engine",
"Fixing a cracked phone screen",
"Troubleshooting Wi-Fi network issues",
"Diagnosing problems with a non-responsive remote control",
"How to reset a frozen smartphone",
"Dealing with an overheating laptop",
"Replacing a broken laptop screen",
"How to upgrade computer RAM",
"Fixing a leaking faucet",
"How to unclog a kitchen sink",
"Diagnosing a noisy refrigerator",
"How to seal window drafts",
"Troubleshooting a non-working ceiling fan",
"Setting up a home office on a budget",
"Fixing a car that won’t start in cold weather",
"How to troubleshoot GPS navigation issues",
"Fixing problems with a garage door opener",
"Troubleshooting smart light bulbs that won’t connect",
"Replacing a broken door lock",
"Fixing a noisy air conditioning unit",
"Troubleshooting camera connectivity on a laptop",
"How to repair a broken headphone jack",
"Setting up a secure home Wi-Fi network",
"Replacing a smartphone battery",
"Installing a wall-mounted TV safely",
"Calibrating a smart thermostat",
"Fixing screen flickering on a monitor",
"Diagnosing strange noises from a desktop computer",
"Solving Bluetooth connection problems",
"Repairing a jammed paper shredder",
"Troubleshooting slow smartphone performance",
"How to stop water leakage under a bathroom sink",
"Installing weather stripping on doors",
"Setting up parental controls on a router",
"Fixing a dishwasher that won’t drain",
"Repairing a damaged phone charging port",
"Replacing a worn-out windshield wiper",
"How to fix a garage light that keeps flickering",
"Solving battery drain issues in electric vehicles",
"Resetting a smart TV to factory settings",
"Troubleshooting a wireless keyboard that won't connect",
"How to install a backup camera in a car"
],
"Educational Interaction": [
"Learning a new language online",
"Understanding the basics of physics",
"Music theory and basic chord progressions",
"The basics of machine learning and AI",
"Introduction to computer programming",
"Understanding the structure of DNA",
"Exploring the history of the Roman Empire",
"The principles of economics",
"The process of photosynthesis in plants",
"Studying the human circulatory system",
"Learning algebra and solving equations",
"The basics of chemistry and atomic structure",
"Studying world geography",
"Learning about climate change and sustainability",
"Understanding how the internet works",
"Intro to creative writing techniques",
"Basics of digital photography",
"Understanding historical timelines",
"Learning financial literacy and budgeting",
"Exploring different art movements",
"Understanding gravity and Newton’s laws",
"Learning HTML and CSS for web design",
"Exploring the solar system",
"Basics of environmental science",
"Introduction to statistics",
"Learning about the American Civil War",
"Understanding cultural anthropology",
"Exploring human anatomy",
"Learning basic sign language",
"Intro to public speaking skills",
"Introduction to ethical philosophy",
"Learning how to conduct scientific experiments",
"Studying global political systems",
"Understanding basic genetics and heredity",
"Learning how to analyze literature",
"Basics of entrepreneurship and starting a business",
"Studying ancient civilizations like Mesopotamia and Egypt",
"Introduction to psychology and behavior",
"Basics of digital citizenship and online safety",
"Understanding the water cycle and weather patterns",
"Learning how to write a research paper",
"Studying global religions and belief systems",
"Intro to logic and critical thinking",
"Understanding supply and demand in markets",
"Learning spreadsheet skills (e.g., Excel or Google Sheets)",
"Introduction to cybersecurity principles",
"Understanding different learning styles",
"Basics of health and nutrition science",
"Learning how to debate effectively"
],
"Health Consultation Interaction": [
"Tips for maintaining a healthy diet",
"Analyzing symptoms of the common cold",
"Dealing with seasonal allergies",
"Understanding mental health and depression",
"Health benefits of regular exercise",
"Managing high blood pressure",
"Identifying signs of anxiety disorder",
"Dealing with insomnia and sleep problems",
"Coping with stress in the workplace",
"Understanding the impact of smoking on health",
"Preventing type 2 diabetes through lifestyle changes",
"Dealing with chronic back pain at home",
"How to support immune health naturally",
"Recognizing early signs of dehydration",
"Understanding the effects of caffeine on the body",
"Managing cholesterol through diet",
"How to build a sustainable workout routine",
"Mental health tips for remote workers",
"Safe exercises for people with joint pain",
"How to talk to a doctor about personal health concerns",
"Advice for managing menstrual cramps",
"Tips for healthy weight loss",
"Understanding the role of sleep in mental wellness",
"How to identify food intolerances",
"Preventing common sports injuries",
"Maintaining good posture while working",
"Recognizing early signs of burnout",
"How to manage asthma symptoms",
"The importance of hydration for brain function",
"Understanding the risks of sedentary lifestyles",
"Managing digestive issues like bloating or IBS",
"How to support bone health as you age",
"Tips for quitting alcohol or reducing intake",
"Understanding the benefits of mindfulness and meditation",
"Recognizing signs of vitamin deficiency",
"Safe stretching routines for flexibility",
"How to create a balanced meal plan",
"Managing migraines and chronic headaches",
"Supporting eye health in the digital age",
"Understanding how hormones affect mood and health",
"Caring for skin during seasonal changes",
"Understanding the basics of reproductive health",
"Dealing with minor injuries at home (cuts, sprains)",
"Tips for building mental resilience",
"Creating a daily self-care routine",
"Navigating food labels and nutrition facts",
"Identifying signs of eating disorders",
"How to stay active while traveling",
"The role of gut health in overall wellness"
],
"Exploratory Interaction": [
"Exploring the concept of time travel",
"Deep-sea exploration and underwater ecosystems",
"Historical events that shaped the world",
"The impact of artificial intelligence on society",
"Exploring the mysteries of the Bermuda Triangle",
"Investigating space exploration and Mars missions",
"The history of human migration",
"The future of renewable energy",
"The impact of global warming on biodiversity",
"Exploring the ancient pyramids of Egypt",
"Uncovering the secrets of black holes",
"The cultural significance of ancient myths",
"Exploring parallel universes and multiverse theories",
"The origins and evolution of language",
"How ancient civilizations built megastructures",
"The search for extraterrestrial life",
"How volcanoes have shaped Earth’s surface",
"The psychology of dreams and their meanings",
"The science behind natural disasters",
"Exploring the concept of simulated reality",
"How ancient trade routes influenced global development",
"Exploring lost civilizations and archaeological mysteries",
"The evolution of the internet and digital culture",
"How pandemics have influenced human history",
"The ethics of genetic modification",
"Exploring the possibility of underwater cities",
"How cultural identity evolves through migration",
"The role of philosophy in modern science",
"Unsolved mysteries in astrophysics",
"Exploring ancient astronomical observatories",
"The influence of mythologies on modern storytelling",
"How ancient weather patterns affected human settlement",
"Exploring the idea of colonizing other planets",
"The rise and fall of legendary empires",
"The possibility of time dilation in deep space travel",
"The influence of alchemy on early science",
"Understanding cryptids and mythological creatures",
"Exploring the legends of Atlantis",
"How music evolved across civilizations",
"The significance of sacred geometry in ancient structures",
"How ancient calendars predicted celestial events",
"The philosophy of consciousness and existence",
"Exploring the science behind telepathy and ESP",
"The history of espionage and intelligence gathering",
"How plagues transformed the course of empires",
"The psychology behind conspiracy theories",
"Exploring the idea of digital immortality",
"How ancient seafaring changed the world map",
"The role of chaos theory in understanding the universe"
],
"Entertainment Interaction": [
"Creating a video game character",
"Writing a mystery novel",
"Designing a new board game",
"Exploring a new fantasy world in literature",
"The psychology behind horror movies",
"The evolution of action films",
"Playing a strategic card game",
"Exploring the art of stand-up comedy",
"How to produce an indie film",
"Creating an engaging video game storyline",
"Writing a screenplay for a short film",
"Building a fantasy football team",
"Exploring behind-the-scenes movie production",
"Learning the basics of animation",
"Creating your own comic book series",
"Composing an original song",
"Understanding character arcs in drama series",
"Creating a YouTube channel for entertainment",
"Developing a murder mystery dinner party game",
"Exploring cosplay and costume design",
"Designing the rules for a role-playing game",
"Recording a podcast about pop culture",
"Writing a fan fiction story",
"Creating a music video on a budget",
"Directing a scene with amateur actors",
"Exploring live streaming as an entertainer",
"Hosting an online trivia night",
"Analyzing what makes a sitcom successful",
"Creating viral content for social media",
"Building a digital art portfolio for entertainment",
"Learning how to voice act for animations or games",
"Creating an interactive story with branching choices",
"Reviewing and critiquing movies or TV shows",
"Designing merchandise for a fictional brand",
"Building a fictional world map for a fantasy series",
"Creating theme music for a character or story",
"Learning stage acting vs. screen acting",
"Writing and performing a comedy skit",
"Planning a virtual concert or talent show",
"Designing a puzzle game with narrative elements",
"Writing a parody song or video",
"Hosting a fictional radio show",
"Analyzing storytelling techniques in video games",
"Developing an ARG (Alternate Reality Game)",
"Creating concept art for a fantasy setting",
"Writing dialogue for an animated series",
"Planning a short film festival with friends",
"Exploring sound design for entertainment media",
"Building a fan community around fictional works"
],
"Simulation Interaction": [
"Business negotiations and decision-making",
"Military strategy and planning simulations",
"Simulation for emergency disaster response",
"Flight training using simulators",
"Healthcare simulation for medical professionals",
"Simulating financial market crashes",
"Simulating environmental disaster scenarios",
"Running a simulated space mission",
"Simulating customer service interactions",
"Creating a disaster management simulation game",
"Simulating a day in the life of a CEO",
"Virtual reality driving test training",
"Crisis management simulation for public relations",
"Political campaign simulation and voter behavior",
"Simulating ethical dilemmas in AI development",
"Simulating the spread of infectious diseases",
"Urban planning simulation for smart cities",
"Simulating climate change over 100 years",
"Training simulations for cybersecurity breaches",
"Economic policy decision-making simulation",
"Simulating courtroom trials and legal strategy",
"Simulation for emergency room triage",
"Virtual surgery practice for medical students",
"Simulating supply chain disruptions",
"Simulating archaeological digs and discoveries",
"Spacewalk training in zero-gravity simulation",
"Language learning through role-playing simulation",
"Simulating diplomatic negotiations between countries",
"Astronaut survival training simulation",
"Simulating startup business pitch competitions",
"Simulating historical battles for education",
"Virtual restaurant management and customer flow simulation",
"Simulating the effects of social media algorithms",
"Driving public transportation in urban simulations",
"Simulating a courtroom debate in a mock trial",
"Disaster recovery planning for IT infrastructure",
"Simulating election outcomes based on real-time data",
"Simulation of water resource management in agriculture",
"Creating a theme park operations simulator",
"Simulating robotics navigation in dynamic environments",
"Simulated coaching for sports teams",
"Simulating ethical decision-making in journalism",
"Simulating airport ground operations and logistics",
"Simulating the development of a new pharmaceutical drug",
"Simulated investment portfolio risk management",
"Simulating refugee crisis response scenarios",
"Virtual museum curation and exhibition planning",
"Simulating interpersonal communication in therapy sessions",
"Simulating AI behavior in self-driving vehicles",
"Virtual internship simulation for workplace readiness"
],
"Emotional Support Interaction": [
"Coping with the death of a loved one",
"Supporting a friend through a breakup",
"Dealing with feelings of loneliness",
"Coping with stress and work-life balance",
"Managing anxiety during uncertain times",
"Dealing with feelings of inadequacy",
"Supporting someone going through mental health challenges",
"Building resilience after a setback",
"Managing anger and frustration",
"Finding emotional support after a major life change",
"Handling the emotional impact of job loss",
"Coping with social anxiety in group settings",
"Dealing with the fear of failure",
"Recovering from a toxic relationship",
"Supporting a child through emotional distress",
"Dealing with homesickness when living abroad",
"Finding motivation during depressive episodes",
"Coping with a chronic illness diagnosis",
"Navigating emotional burnout as a caregiver",
"Overcoming feelings of rejection",
"Learning to forgive yourself after a mistake",
"Supporting a partner dealing with trauma",
"Handling the emotions of being a new parent",
"Rebuilding confidence after public embarrassment",
"Managing expectations during major life transitions",
"Dealing with guilt from past decisions",
"Helping someone through a panic attack",
"Coping with grief after a pet passes away",
"Facing loneliness during the holiday season",
"Balancing emotional vulnerability and self-protection",
"Processing emotions after a traumatic event",
"Helping teens deal with peer pressure",
"Managing jealousy in a relationship",
"Supporting an elderly parent with emotional needs",
"Navigating friendship breakups with maturity",
"Coping with fear of the future",
"Dealing with body image issues and self-worth",
"Handling emotional distance in long-term relationships",
"Managing stress related to academic pressure",
"Providing comfort to someone experiencing shame",
"Processing mixed emotions after a big achievement",
"Supporting someone with PTSD triggers",
"Coping with infertility and emotional distress",
"Rebuilding trust after betrayal",
"Helping a loved one experiencing suicidal thoughts",
"Dealing with emotional triggers in daily life",
"Finding peace with an unresolved conflict",
"Managing emotions after relocation or immigration",
"Coping with fear of abandonment"
],
"Information Retrieval Interaction": [
"Finding the best tech product reviews online",
"Looking up information on the latest scientific discoveries",
"How to find reliable health advice on the internet",
"Searching for a vacation destination based on reviews",
"Finding the most recent climate change data",
"Looking for historical documents on ancient civilizations",
"Researching news about artificial intelligence advancements",
"Finding user reviews for a new gadget",
"Searching for scholarly articles on quantum computing",
"Finding government reports on public health",
"Locating top-rated online courses for career development",
"Finding official information on visa requirements",
"Researching the latest trends in the stock market",
"Finding statistical data for academic research",
"Looking up real-time traffic and commute updates",
"Finding reviews and ratings for local restaurants",
"Searching for housing market reports in a specific city",
"Finding information on upcoming local events",
"Researching criminal records or public legal cases",
"Finding comparison data on different insurance policies",
"Searching for open-source software alternatives",
"Looking up case studies for business or marketing",
"Finding details on government aid programs",
"Researching side effects of prescription medications",
"Finding technical documentation for programming libraries",
"Looking up airline safety records",
"Searching for consumer complaint databases",
"Finding educational videos on historical topics",
"Researching the genealogy of a family name",
"Looking up employment law information by state",
"Finding patent information for a new invention",
"Researching cultural practices in different countries",
"Searching for reviews of online learning platforms",
"Finding data on renewable energy usage by country",
"Looking up public records on local property ownership",
"Finding historical weather data for a location",
"Searching for quotes and citations in classic literature",
"Finding nutrition information for restaurant meals",
"Researching ethical sourcing of fashion brands",
"Looking up vehicle recall history by VIN",
"Finding demographic data for a specific region",
"Researching nonprofit organization transparency reports",
"Searching for academic conference proceedings",
"Finding ratings and reviews of mobile apps",
"Looking up historical election results by district",
"Finding documentation on space exploration missions",
"Researching funding opportunities for small businesses",
"Searching for media coverage on social justice issues",
"Finding open data sets for machine learning training",
"Looking up safety information on household chemicals"
],
"Transaction Interaction": [
"Booking a flight online for a vacation",
"How to purchase concert tickets online",
"Making an appointment with a service provider",
"Ordering food online for delivery",
"Purchasing a product through an e-commerce site",
"How to buy insurance online",
"Scheduling a medical appointment",
"Making a donation to a charity online",
"Buying a gift card for a friend",
"How to apply for a mortgage loan",
"Renewing a vehicle registration online",
"Paying utility bills through a mobile app",
"Booking a hotel room for a weekend trip",
"Registering for an online course or certification",
"Subscribing to a streaming service",
"Buying event tickets with a digital wallet",
"Applying for a credit card through a website",
"Reserving a rental car at the airport",
"Paying property taxes online",
"Purchasing digital books or audiobooks",
"Ordering groceries from an online supermarket",
"Paying tuition fees through a university portal",
"Signing up for a gym membership online",
"Applying for unemployment benefits digitally",
"Reserving a table at a restaurant using an app",
"Buying and downloading software securely",
"Sending money internationally via online banking",
"Registering a domain and hosting a website",
"Buying stocks or cryptocurrency through a trading platform",
"Purchasing travel insurance before a trip",
"Ordering custom clothing or merchandise online",
"Buying a used car through an online marketplace",
"Paying for public transportation with a mobile wallet",
"Subscribing to a monthly subscription box service",
"Purchasing online advertising for a small business",
"Topping up a prepaid phone plan online",
"Paying for freelance services via a gig platform",
"Placing a mobile order for in-store pickup",
"Applying for a personal loan through a fintech app",
"Booking a guided tour or local experience online",
"Paying entry fees for a virtual event or webinar",
"Setting up automatic payments for monthly bills",
"Buying furniture or home goods with financing options",
"Purchasing digital game content or in-app items",
"Contributing to a crowdfunding campaign",
"Paying for parking through a mobile parking app",
"Ordering prescription medication online",
"Reserving coworking space for remote work",
"Paying for tutoring or online lessons"
]
}
def build_prompt(self, num_dialogs_per_intent):
prompt = """
Task Description and Rules
1. Generate multiple rounds of realistic user questions based on the provided topic:
- Based on a single core topic (provided directly by the user), generate multiple rounds of realistic user questions, comprising 6-8 turns in total.
- The questions should match the characteristics of real users in natural communication: sometimes simple, sometimes vague, or including contextual backgrounds, and should reflect the language style of daily communication.
- Note: Avoid directly including the exact expression of the input topic in the questions. Instead, abstract it with natural and conversational language in practical scenarios.
2. Dynamic Dialogue Information Flow in Conversations: Below are the relevant steps of the information flow: {info_flow}
The dialogue style should adhere to the following requirements:
- Utilize natural phrasing and vivid language, avoiding overly mechanical responses.
- Favor shorter sentences in questions, with occasional subject omission allowed.
- Ensure smooth and logical transitions through lighthearted or entertaining interjections.
- Permit the expression of specific personality traits and individualized tones.
- Proactively introduce new topics when appropriate, ensuring relevance to the current theme.
The dialogue should comply with the following generation rules:
- For each round of dialogue, only simulate user questions without providing answers.
- Ensure the conversation flows naturally and reflects realistic interactive thinking.
- Avoid overly polished or templated content, ensuring the questions feel authentic and relatable in life scenarios.
Output Format:
Multi-turn Questions in JSON Format:
{{
"category": "<Core Topic of the Conversation>",
"turns": ["<turn_1>", "<turn_2>", "<turn_3>", "..."]
}}
To generate multi-turn queries with high topic consistency, please think step-by-step.
The input core topic for this task is: {topic}
"""
all_query_prompts = []
for intent, info_flows in self.intent_categories.items():
for _ in range(num_dialogs_per_intent):
info_flow = random.choice(info_flows)
topic = random.choice(self.topic_dict[intent])
query_prompt = prompt.format(info_flow=info_flow, topic=topic)
all_query_prompts.append(query_prompt)
return all_query_prompts
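# Illustrative usage (comment-only sketch; `prompt_template` is a hypothetical
# instance of the query-prompt class above):
#   query_prompts = prompt_template.build_prompt(num_dialogs_per_intent=2)
# This yields one prompt per (intent, dialog) pair, each formatted with a randomly
# sampled information flow and topic from the dictionaries defined above.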
@PROMPT_REGISTRY.register()
class ConsistentResponsePrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, topic, queries):
prompt = f"""
Your task is to simulate a multi-turn conversation where you progressively answer a series of user questions provided under a given topic category. For each answer, focus on delivering a natural, contextually relevant, and actionable response while considering both the current question and future questions in the sequence. The goal is to ensure consistency and logical progression throughout the dialogue while avoiding unnecessary follow-up questions in the responses. To generate multi-turn responses with high topic consistency, think step-by-step. Key Dialogue Style Requirements are as follows:
Content and Structure:
1. Directly Answer the Current Question:
- Provide a complete, useful response to the current question without posing additional questions unless they are directly relevant to future queries.
- If clarification or additional steps are needed, frame these as suggestions or explanations rather than questions.
2. Be Context-Aware:
- Always tailor each response to the current question while remaining mindful of the context provided by prior and future questions.
- Avoid prematurely addressing future queries but create subtle links where necessary to ensure smooth progression.
3. Clear, Action-Oriented Responses:
- Focus on providing actionable advice, logical explanations, or troubleshooting steps rather than speculative or rhetorical remarks.
- Avoid long or overly complex explanations; aim for clarity and efficiency.
Tone and Style:
1. Conversational and Supportive:
- Use a natural, empathetic tone that simulates real-life problem-solving interactions.
- Avoid mechanical or overly formal responses.
2. Economical with Words:
- Keep responses concise but informative. Minimize extraneous content while ensuring answers have enough detail to be helpful.
3. No Unnecessary Questions:
- Limit unnecessary questions in the responses and focus instead on providing actionable steps or solutions directly. Avoid follow-up questions that don’t align with the next user query.
Turn-by-Turn Instructions:
1. Answer Exclusively for the Current Question:
- For each turn, generate an answer that directly addresses the immediate question. Avoid revisiting past details unnecessarily unless they are highly relevant.
- While you shouldn’t anticipate or directly answer future queries, your response should create natural openings for upcoming questions if applicable.
2. Avoid Irrelevant Follow-Up Questions:
- If the immediate question doesn’t require clarification, frame your response as a statement or suggestion rather than a question.
- Maintain alignment with the logical flow of dialogue to ensure each turn is coherent.
3. Proactively Provide Scenarios or Steps:
- Where appropriate, guide the user with specific recommendations, troubleshooting actions, or observations they can make without requiring back-and-forth clarification.
Output Requirements:
The output must simulate the conversation by only providing responses (one per turn) in a sequential manner. The final format must strictly adhere to valid JSON and include the required structure.
The input core topic and questions-only turns for this task are:
core topic: {topic}
queries:
{', '.join([f'User query: {query}' for query in queries])}
"""
return prompt
@PROMPT_REGISTRY.register()
class CondorQuestionPrompt(PromptABC):
def __init__(self):
self.tag = {
"Marriage and Relationships": {
"Dating and Friendship": ["Dating Platforms", "Dating Tips", "Dating Events"],
"Marriage Management": ["Marital Relationships", "Marriage Law", "Marriage Counseling"],
"Wedding Planning": ["Wedding Planning", "Wedding Photography", "Wedding Venues"],
"Relationship Psychology": ["Relationship Psychology", "Communication Skills in Relationships", "Relationship Maintenance"],
"Emotional Counseling": ["Solving Emotional Issues", "Emotional Repair", "Emotional Growth"],
"Pre-Marriage Education": ["Pre-Marriage Preparation", "Pre-Marriage Psychology", "Pre-Marriage Legal Knowledge"]
},
"Entertainment Gossip": {
"Celebrity News": ["Celebrity News", "Celebrity Interviews", "Celebrity Charity Events"],
"Variety Shows": ["Show Recommendations", "Behind the Scenes", "Show Interaction"],
"Film and TV Reviews": ["Movie Reviews", "TV Series Reviews", "Critics’ Opinions"],
"Entertainment News": ["Latest Entertainment News", "Entertainment Events", "Exclusive Interviews"],
"Fan Culture": ["Fan Activities", "Fan Support", "Fan Interactions"],
"Gossip": ["Celebrity Gossip", "Entertainment Industry Secrets", "Gossip Chasing"]
},
"Artificial Intelligence": {
"Machine Learning": ["Algorithm Principles", "Application Cases", "Learning Resources"],
"Deep Learning": ["Neural Networks", "Deep Learning Frameworks", "Deep Learning Applications"],
"Natural Language Processing": ["Language Models", "Text Analysis", "Dialogue Systems"],
"Computer Vision": ["Image Recognition", "Video Processing", "Vision Algorithms"],
"Intelligent Robotics": ["Robotics Technology", "Service Robots", "Industrial Robots"],
"Autonomous Driving": ["Autonomous Driving Technology", "Autonomous Driving Regulations", "Autonomous Driving Testing"]
},
"Healthcare": {
"Disease Prevention and Treatment": ["Common Diseases", "Preventive Measures", "Disease Treatment"],
"Health and Wellness": ["Dietary Wellness", "Exercise Wellness", "Traditional Chinese Medicine Wellness"],
"Psychological Counseling": ["Mental Health Issues", "Psychological Therapy", "Psychological Adjustment"],
"Medical Technology": ["Medical Equipment", "Medical Technology", "Medical Innovation"],
"Health Insurance": ["Types of Insurance", "Insurance Choices", "Insurance Claims"],
"Fitness": ["Fitness Methods", "Fitness Equipment", "Fitness Diet"]
},
"Pets": {
"Pet Care": ["Daily Pet Care", "Pet Nutrition", "Pet Behavior"],
"Pet Medical Care": ["Pet Diseases", "Pet First Aid", "Pet Hospitals"],
"Pet Training": ["Basic Training", "Behavior Correction", "Training Techniques"],
"Pet Supplies": ["Toys", "Food", "Care Products"],
"Pet Adoption": ["Adoption Procedures", "Adoption Conditions", "Adoption Events"],
"Pet Activities": ["Pet Competitions", "Pet Gatherings", "Pet Festivals"]
},
"Environment": {
"Environmental Protection": ["Ecological Protection", "Pollution Control", "Environmental Monitoring"],
"Sustainable Development": ["Green Energy", "Circular Economy", "Ecological Agriculture"],
"Energy Conservation and Emission Reduction": ["Energy-Saving Technology", "Emission Reduction Policies", "Low-Carbon Life"],
"Waste Sorting": ["Sorting Standards", "Sorting Methods", "Recycling"],
"Environmental Policies": ["Policy Regulations", "Policy Interpretation", "Policy Impact"],
"Green Living": ["Green Consumption", "Green Travel", "Green Buildings"]
},
"Technology": {
"Internet": ["Network Technology", "Cybersecurity", "Online Services"],
"5G Communication": ["5G Technology", "5G Applications", "5G Devices"],
"Blockchain": ["Blockchain Principles", "Blockchain Applications", "Digital Currency"],
"Artificial Intelligence": ["AI Technology", "AI Ethics", "AI Industry Applications"],
"Aerospace": ["Aerospace Technology", "Aircraft", "Space Exploration"],
"New Energy": ["Solar Energy", "Wind Energy", "New Energy Vehicles", "Energy Storage"]
},
"Education and Training": {
"Preschool Education": ["Choosing Kindergartens", "Early Childhood Education", "Preschool Education Policies"],
"K12 Education": ["Primary Education", "Secondary Education", "Family Education Guidance"],
"Higher Education": ["University Major Selection", "Graduate Education", "Higher Education Policies"],
"Vocational Training": ["Vocational Skills Training", "Professional Certifications", "Career Development Planning"],
"Online Education": ["Online Course Recommendations", "Distance Education", "Online Learning Tips"],
"Study Abroad and Immigration": ["Study Abroad Consultation", "Immigration Policies", "Overseas Living Guide"]
},
"Career Development": {
"Career Planning": ["Career Positioning", "Career Development Paths", "Career Transition Guidance"],
"Job Search Skills": ["Resume Writing", "Interview Skills", "Job Search Channels"],
"Career Advancement": ["Promotion Strategies", "Workplace Performance", "Leadership Development"],
"Interpersonal Relationships": ["Colleague Interaction", "Workplace Communication", "Workplace Etiquette"],
"Entrepreneurship Guidance": ["Entrepreneurship Plans", "Entrepreneurship Resources", "Entrepreneurship Risk Management"],
"Team Management": ["Team Building", "Team Collaboration", "Team Performance Management"]
},
"Finance and Investment": {
"Stocks": ["Stock Market Analysis", "Stock Investment Strategies", "Stock Research"],
"Funds": ["Fund Selection", "Systematic Investment Plans", "Fund Risk Management"],
"Futures": ["Futures Market", "Futures Trading Skills", "Futures Risk Control"],
"Foreign Exchange": ["Forex Trading", "Forex Market Analysis", "Forex Risk Management"],
"Insurance": ["Insurance Product Selection", "Insurance Planning", "Insurance Claims"],
"Financial Planning": ["Personal Finance", "Asset Allocation", "Retirement Planning"]
},
"Real Estate and Home Living": {
"Real Estate Market": ["Market Trends", "Property Price Analysis", "Real Estate Policy Interpretation"],
"Home Buying Guide": ["Home Selection Tips", "Home Buying Process", "Mortgage Application"],
"Interior Design": ["Decorating Styles", "Decorating Materials", "Decorating Budget"],
"Home Living": ["Home Arrangement", "Home Maintenance", "Smart Homes"],
"Real Estate Policies": ["Policy Updates", "Policy Interpretation", "Policy Impact"],
"Rental Market": ["Rental Process", "Rental Agreements", "Rental Tips"]
},
"Travel and Adventure": {
"Domestic Travel": ["Destination Recommendations", "Domestic Travel Guides", "Travel Safety"],
"International Travel": ["Visa Applications", "International Travel Guides", "Cultural Adaptation"],
"Outdoor Adventures": ["Hiking", "Mountain Climbing", "Wilderness Survival Skills"],
"Travel Guides": ["Travel Planning", "Travel Budget", "Travel Packing Lists"],
"Travel Equipment": ["Backpack Selection", "Outdoor Gear", "Travel Essentials"],
"Travel Photography": ["Photography Tips", "Travel Photography Works", "Photography Equipment Recommendations"]
},
"Food and Cooking": {
"Food Recommendations": ["Local Delicacies", "Food Rankings", "Restaurant Recommendations"],
"Cooking Skills": ["Basic Cooking", "Creative Cooking", "Cooking Tool Usage"],
"Ingredient Selection": ["Ingredient Selection Tips", "Seasonal Ingredients", "Organic Ingredients"],
"Food Culture": ["Food Culture", "Local Food Customs", "Dietary Health"],
"Healthy Eating": ["Balanced Nutrition", "Healthy Recipes", "Dietary Wellness"],
"Baking and Desserts": ["Dessert Making", "Baking Skills", "Dessert Ingredients"]
},
"Culture and Arts": {
"Literature": ["Literary Works", "Literary Criticism", "Creative Writing Skills"],
"Music": ["Music Styles", "Music Production", "Music Appreciation"],
"Painting": ["Painting Techniques", "Painting Schools", "Painting Appreciation"],
"Sculpture": ["Sculpture Art", "Sculpture Creation", "Sculpture Materials"],
"Theater": ["Theater Performance", "Theater Creation", "Theater History"],
"Film": ["Film Recommendations", "Film Reviews", "Film Production"]
},
"Sports and Fitness": {
"Sports Events": ["Event Broadcasts", "Event Analysis", "Event History"],
"Fitness Methods": ["Fitness Tutorials", "Fitness Plans", "Fitness Diet"],
"Sports Equipment": ["Equipment Recommendations", "Equipment Usage", "Equipment Maintenance"],
"Sports Celebrities": ["Celebrity Introductions", "Celebrity Interviews", "Celebrity Events"],
"Sports Policies": ["Policy Interpretation", "Policy Impact", "Policy Updates"],
"Sports Industry": ["Industry Trends", "Industry Investment", "Industry Cases"]
},
"Military and National Defense": {
"Military News": ["News Reports", "News Analysis", "Military Updates"],
"Defense Technology": ["Technology Advancements", "Technology Applications", "Innovative Technologies"],
"Weapons and Equipment": ["Equipment Introduction", "Equipment Comparison", "Equipment Maintenance"],
"Military History": ["Historical Events", "Historical Battles", "Historical Figures"],
"Military Service System": ["Service Regulations", "Enlistment Process", "Veterans' Policies"],
"National Security": ["Security Policies", "Security Education", "Security Awareness"]
},
"Social Welfare": {
"Charity Donations": ["Donation Channels", "Donation Impact", "Donation Stories"],
"Volunteer Services": ["Service Projects", "Service Training", "Volunteer Stories"],
"Public Welfare Activities": ["Activity Organization", "Activity Participation", "Activity Impact"],
"Public Welfare Organizations": ["Organization Introductions", "Organization Activities", "Organization Cooperation"],
"Social Assistance": ["Assistance Targets", "Assistance Methods", "Assistance Policies"],
"Spreading Love": ["Spreading Methods", "Spreading Activities", "Spreading Impact"]
},
"Automotive and Transportation": {
"Automotive News": ["New Car Releases", "Car Reviews", "Automotive Trends"],
"Driving Skills": ["Safe Driving", "Fuel-Efficient Driving", "Driver Training"],
"Vehicle Maintenance": ["Routine Maintenance", "Fault Diagnosis", "Repair Services"],
"Traffic Laws": ["Law Interpretation", "Safety Education", "Law Updates"],
"New Energy Vehicles": ["Technical Features", "Market Dynamics", "Policy Support"],
"Smart Transportation": ["Technology Applications", "Smart Systems", "Future Trends"]
},
"E-commerce": {
"Online Shopping": ["Shopping Guides", "User Reviews", "Promotions"],
"E-commerce Operations": ["Operations Management", "Market Analysis", "Customer Service"],
"Cross-border E-commerce": ["International Logistics", "Tariff Policies", "Market Analysis"],
"E-commerce Policies": ["Policy Interpretation", "Policy Impact", "Compliance Operations"],
"E-commerce Marketing": ["Marketing Strategies", "Advertising Placement", "User Analysis"],
"E-commerce Logistics": ["Logistics Delivery", "Inventory Management", "Logistics Technology"]
},
"Gaming and Animation": {
"Online Games": ["Popular Games", "Game Reviews", "Gaming Communities"],
"Single-player Games": ["Classic Games", "Game Guides", "Game Recommendations"],
"Animation Works": ["Popular Anime", "Anime Characters", "Anime Production"],
"Game Guides": ["Guide Sharing", "Skill Exchange", "Guide Videos"],
"Animation Industry": ["Industry Trends", "Market Analysis", "Industry Policies"],
"Game Merchandise": ["Merchandise Products", "Collecting Guides", "Merchandise Events"]
},
"Infant and Child Education": {
"Early Education": ["Educational Philosophy", "Educational Methods", "Educational Toys"],
"Maternal and Infant Care": ["Care Knowledge", "Care Skills", "Care Products"],
"Child Psychology": ["Psychological Development", "Emotion Management", "Psychological Counseling"],
"Parent-child Relationship": ["Parent-child Activities", "Parent-child Communication", "Parent-child Education"],
"Baby Products": ["Product Selection", "Safety Standards", "Product Recommendations"],
"Child Health": ["Healthy Growth", "Nutritional Diet", "Disease Prevention"]
},
"Senior Life": {
"Elderly Care Policies": ["Policy Interpretation", "Policy Consultation", "Policy Implementation"],
"Senior Health": ["Health Checkups", "Disease Prevention", "Healthy Eating"],
"Senior Activities": ["Cultural Activities", "Sports Activities", "Social Activities"],
"Senior Psychology": ["Psychological Adjustment", "Psychological Health", "Psychological Support"],
"Elderly Care Institutions": ["Institution Selection", "Service Quality", "Institution Evaluation"],
"Senior Products": ["Assistance Products", "Health Products", "Living Products"]
},
"Psychological Counseling": {
"Mental Health": ["Mental Maintenance", "Mental Problem Prevention", "Mental Health Education"],
"Psychological Disorders": ["Disorder Identification", "Disorder Treatment", "Disorder Management"],
"Counseling Skills": ["Counseling Methods", "Communication Skills", "Case Studies"],
"Psychological Tests": ["Test Types", "Test Applications", "Test Interpretation"],
"Psychological Research": ["Research Trends", "Research Methods", "Research Results"],
"Psychological Guidance": ["Guidance Strategies", "Guidance Cases", "Guidance Resources"]
},
"Religion and Belief": {
"Religious Culture": ["Cultural Traditions", "Cultural Festivals", "Cultural Influence"],
"Religious History": ["Historical Development", "Key Events", "Historical Figures"],
"Religious Art": ["Art Forms", "Art Works", "Art Value"],
"Religious Policies": ["Policy Regulations", "Policy Interpretation", "Policy Impact"],
"Religious Activities": ["Activity Organization", "Activity Participation", "Activity Significance"],
"Faith Discussions": ["Meaning of Faith", "Faith Conflicts", "Faith Diversity"]
},
"Agriculture and Rural Development": {
"Agricultural Technology": ["Technology Applications", "Technological Innovation", "Technology Promotion"],
"Rural Development": ["Development Planning", "Development Models", "Development Cases"],
"Farmer Life": ["Life Improvement", "Quality of Life", "Living Customs"],
"Agricultural Products Market": ["Market Analysis", "Market Trends", "Market Transactions"],
"Agricultural Policies": ["Policy Support", "Policy Interpretation", "Policy Implementation"],
"Rural Tourism": ["Tourism Development", "Tourism Projects", "Tourism Experience"]
},
"Urban Planning": {
"Urban Planning": ["Planning Philosophy", "Planning Methods", "Planning Cases"],
"Urban Design": ["Design Philosophy", "Design Elements", "Design Practice"],
"Infrastructure Development": ["Development Planning", "Development Management", "Development Technology"],
"Urban Transportation": ["Transportation Planning", "Transportation Management", "Transportation Tools"],
"Urban Greening": ["Greening Layout", "Greening Technology", "Greening Effects"],
"Protection of Historic Cities": ["Protection Policies", "Protection Measures", "Protection Cases"]
},
"Laws and Regulations": {
"Civil Law": ["General Principles", "Property Law", "Contract Law"],
"Criminal Law": ["General Principles", "Types of Crimes", "Punishment Systems"],
"Administrative Law": ["Administrative Regulations", "Administrative Litigation", "Administrative Reconsideration"],
"Economic Law": ["Corporate Law", "Tax Law", "Intellectual Property Law"],
"International Law": ["Public International Law", "Private International Law", "International Trade Law"],
"Legal Consultation": ["Consultation Services", "Legal Aid", "Legal Education"]
},
"Art": {
"Painting": ["Painting Techniques", "Painting Styles", "Painting Works"],
"Sculpture": ["Sculpture Materials", "Sculpture Styles", "Sculpture Creation"],
"Design": ["Design Philosophy", "Design Methods", "Design Works"],
"Photography": ["Photography Techniques", "Photography Themes", "Photography Works"],
"Calligraphy": ["Calligraphy Art", "Calligraphy Styles", "Calligraphy Works"],
"Handicrafts": ["Craft Making", "Craft Materials", "Craft Culture"]
},
"Marketing": {
"Market Research": ["Research Methods", "Research Tools", "Research Reports"],
"Marketing Strategies": ["Strategy Formulation", "Strategy Execution", "Strategy Evaluation"],
"Brand Management": ["Brand Positioning", "Brand Promotion", "Brand Maintenance"],
"Advertising": ["Creative Advertising", "Advertising Media", "Advertising Effectiveness"],
"Public Relations": ["Event Planning", "Event Execution", "Event Evaluation"],
"Channel Development": ["Channel Expansion", "Channel Management", "Channel Optimization"]
},
"Astronomy and Geography": {
"Astronomy": ["Astronomical Observations", "Astronomical Phenomena", "Astronomical Research"],
"Geography": ["Geographical Knowledge", "Geographical Exploration", "Geographical Education"],
"Geology": ["Geological Structure", "Geological Survey", "Geological Protection"],
"Meteorology": ["Weather Forecasting", "Weather Disasters", "Weather Services"],
"Space Exploration": ["Space Exploration", "Interstellar Travel", "Extraterrestrial Life"],
"Geographical Information Systems": ["GIS Technology", "GIS Applications", "GIS Development"]
},
"Education and Exams": {
"College Entrance Exam Coaching": ["Preparation Strategies", "Practice Tests", "Exam Policy Interpretation"],
"Graduate School Entrance Exam Coaching": ["Preparation Planning", "Specialty Coaching", "Psychological Adjustment"],
"Civil Service Exams": ["Exam Techniques", "Essay Writing Guidance", "Interview Preparation"],
"Teaching Qualification Exams": ["Exam Process", "Interview Skills", "Teaching Ability Improvement"],
"Foreign Language Exams": ["CET-4/CET-6", "IELTS/TOEFL", "Foreign Language Speaking Training"],
"Professional Qualification Exams": ["Exam Subjects", "Career Development", "Qualification Certification"]
},
"Cybersecurity": {
"Cybersecurity Protection": ["Protection Measures", "Security Tools", "Protection Strategies"],
"Hacker Attack and Defense": ["Attack and Defense Drills", "Security Vulnerabilities", "Hacking Techniques"],
"Data Encryption": ["Encryption Technology", "Data Protection", "Encryption Strategies"],
"Information Leak Prevention": ["Leakage Risks", "Prevention Measures", "Emergency Response"],
"Cybersecurity Policies": ["Policy Interpretation", "Regulations and Standards", "Policy Updates"],
"Cybersecurity Incidents": ["Incident Analysis", "Incident Tracking", "Incident Prevention"]
},
"Fashion and Trends": {
"Clothing Matching": ["Everyday Outfits", "Dressing for Occasions", "Fashion Trends"],
"Beauty and Skincare": ["Skincare Knowledge", "Makeup Skills", "Beauty Products"],
"Fashion Accessories": ["Jewelry Matching", "Accessory Selection", "Trendy Accessories"],
"Trend Analysis": ["Fashion Week", "Trend Analysis", "Trend Forecasting"],
"Fashion Bloggers": ["Blogger Recommendations", "Blogger Styles", "Blogger Influence"],
"Fashion Brands": ["Brand Stories", "Brand Series", "Brand Events"]
},
"Mental Health": {
"Emotion Management": ["Emotion Recognition", "Emotion Regulation", "Emotion Expression"],
"Stress Management": ["Stress Sources", "Stress Relief Techniques", "Stress Management"],
"Interpersonal Relationships": ["Communication Skills", "Conflict Resolution", "Social Skills"],
"Self-Awareness": ["Self-Exploration", "Self-Evaluation", "Personal Growth"],
"Psychological Adjustment": ["Adjustment Methods", "Psychological Balance", "Psychological Resilience"],
"Psychological Disorder Prevention": ["Disorder Knowledge", "Prevention Measures", "Health Promotion"]
},
"Agricultural Technology": {
"Smart Agriculture": ["Smart Technology", "Precision Agriculture", "Agricultural Big Data"],
"Agricultural Mechanization": ["Mechanization Applications", "Technological Innovation", "Mechanization Maintenance"],
"Agricultural Product Processing": ["Processing Technology", "Product Innovation", "Quality Control"],
"Agricultural Innovation": ["Innovation Cases", "Innovation Policies", "Innovation-Driven Development"],
"Agricultural Policies": ["Policy Support", "Policy Interpretation", "Policy Implementation"],
"Agricultural Market Analysis": ["Market Trends", "Demand Analysis", "Price Fluctuations"]
},
"Digital Products": {
"Smartphone Reviews": ["Performance Testing", "User Experience", "New Releases"],
"Computer Hardware": ["Hardware Configuration", "Hardware Upgrades", "Hardware Maintenance"],
"Digital Cameras": ["Camera Selection", "Photography Tips", "Camera Maintenance"],
"Wearable Devices": ["Device Functions", "Health Monitoring", "Smart Interactions"],
"Routers": ["Router Setup", "Signal Optimization", "Network Security"],
"Digital Accessories": ["Accessory Selection", "Device Protection", "Accessory Recommendations"]
},
"Home Decoration": {
"Decoration Styles": ["Modern Minimalism", "Classical Chinese Style", "Luxurious European Style"],
"Decoration Materials": ["Material Selection", "Material Environmental Protection", "Material Costs"],
"Interior Design": ["Space Planning", "Furniture Selection", "Color Matching"],
"Soft Decoration": ["Curtain Selection", "Bedding Matching", "Decorative Paintings"],
"Feng Shui": ["Feng Shui Layout", "Feng Shui Taboos", "Feng Shui Improvements"],
"Renovation Construction": ["Construction Process", "Construction Supervision", "Construction Safety"]
},
"History and Culture": {
"Chinese History": ["Ancient History", "Modern History", "History Education"],
"World History": ["Origins of Civilization", "Historical Events", "International Relations"],
"Archaeological Discoveries": ["Site Excavation", "Cultural Relic Protection", "Archaeological Techniques"],
"Historical Figures": ["Biographies", "Character Evaluations", "Historical Impact"],
"Cultural Heritage": ["Heritage Protection", "Heritage Value", "Heritage Inheritance"],
"Historical Research": ["Research Methods", "Academic Achievements", "Research Trends"]
},
"Travel Guides": {
"Independent Travel Guides": ["Destination Recommendations", "Itinerary Planning", "Accommodation Selection"],
"Group Travel Guides": ["Tour Agency Selection", "Group Activities", "Group Travel Advantages"],
"Tourism Route Planning": ["Route Design", "Special Routes", "Theme Travel"],
"Money-Saving Travel Tips": ["Budget Planning", "Spending Guides", "Discount Information"],
"Travel Safety": ["Safety Tips", "Emergency Handling", "Insurance Selection"],
"Travel Visas": ["Visa Applications", "Visa Policies", "Visa Documentation"]
},
"Food Sharing": {
"Recipe Sharing": ["Recipe Sharing", "Cooking Skills", "Ingredient Selection"],
"Food Recommendations": ["Special Dishes", "Local Snacks", "Restaurant Recommendations"],
"Food Exploration": ["Exploration Guides", "Shop Reviews", "Food Maps"],
"Food Photography": ["Photography Skills", "Food Presentation", "Visual Display"],
"Food Reviews": ["Dish Reviews", "Restaurant Reviews", "Ingredient Reviews"],
"Food Competitions": ["Competition Information", "Participation Guidelines", "Award-Winning Works"]
},
"Film and Entertainment": {
"Movie Recommendations": ["New Movie Alerts", "Classic Movies", "Movie Rankings"],
"TV Series Reviews": ["Popular Drama Reviews", "Series Recommendations", "Plot Analysis"],
"Variety Show Reviews": ["Program Highlights", "Guest Performances", "Program Creativity"],
"Online Series": ["Popular Online Series", "Online Series Production", "Online Series Trends"],
"Short Videos": ["Short Video Creation", "Short Video Platforms", "Short Video Marketing"],
"Film Production": ["Production Process", "Behind the Scenes", "Production Techniques"]
},
"Sports Activities": {
"Ball Sports": ["Football", "Basketball", "Volleyball"],
"Track and Field": ["Running", "Long Jump", "Throwing"],
"Water Sports": ["Swimming", "Rowing", "Surfing"],
"Winter Sports": ["Skiing", "Ice Skating", "Sledding"],
"Extreme Sports": ["Rock Climbing", "Skydiving", "Extreme Cycling"],
"Sports Events": ["International Events", "Domestic Events", "Local Events"]
},
"Entrepreneurship and Investment": {
"Entrepreneurship Guidance": ["Entrepreneurship Plans", "Market Analysis", "Entrepreneurship Mindset"],
"Investment and Finance": ["Investment Strategies", "Asset Management", "Risk Control"],
"Entrepreneurship Policies": ["Policy Interpretation", "Policy Support", "Policy Utilization"],
"Entrepreneurship Cases": ["Success Stories", "Lessons Learned", "Case Analysis"],
"Venture Capital": ["Investment Opportunities", "Investment Evaluation", "Investment Negotiation"],
"Entrepreneurship Financing": ["Financing Channels", "Financing Strategies", "Financing Agreements"]
},
"Music and Dance": {
"Music Appreciation": ["Music Styles", "Music Works", "Musicians"],
"Instrumental Performance": ["Instrument Selection", "Performance Techniques", "Instrument Maintenance"],
"Dance Performance": ["Dance Types", "Performance Techniques", "Performance Opportunities"],
"Music Production": ["Music Creation", "Music Recording", "Music Publishing"],
"Music Education": ["Education Methods", "Educational Resources", "Education Policies"],
"Dance Choreography": ["Choreography Techniques", "Choreography Creativity", "Choreography Practice"]
},
"National Defense and Military": {
"Military Strategy": ["Strategy Analysis", "Strategy Planning", "Strategy Implementation"],
"Military Training": ["Basic Training", "Tactical Training", "Special Forces Training"],
"Weapons Development": ["Equipment Introduction", "Research and Development Updates", "Technological Innovation"],
"Military History": ["Historical Battles", "Historical Figures", "Historical Events"],
"National Defense Education": ["Educational Content", "Educational Methods", "Educational Significance"],
"Military Exercises": ["Exercise Types", "Exercise Scale", "Exercise Objectives"]
}
}
        # Task types (to increase scenario diversity; based on common interaction scenarios referenced in the paper)
self.task_types = [
"Daily Conversation",
"Creative Task",
"Role Playing",
"Problem Solving",
"Educational Explanation",
"Emotional Support",
"Information Retrieval"
]
def build_prompt(self, theme, domain):
"""
Generates the formatted prompt for LLM input based on the theme and domain.
Parameters:
theme (str): The main theme of the questions.
domain (str): The domain under the given theme.
Returns:
str: The formatted prompt for generating questions.
"""
prompt = f"""
Now we need to create high-quality SFT data for LLM training, so we need you to produce a batch of such data. You only
need to create Questions. I will give you a theme for SFT data Questions. You need to create three
Questions of different difficulty levels based on this new theme.\\
Your Questions must meet the following requirements:\\
1. You must strictly create only three Questions at a time. These three Questions must be in the domain of {domain}
and the Questions should align with the given theme of {theme}.\\
2. The Questions you create must have context and sufficient information; they should not be abrupt and directly ask the
question.\\
3. Your reply must strictly follow the format below. Your Questions need to be included between [Question Start] and
[Question End], and the difficulty level should be indicated at the beginning, as in the following format:\\
[Easy][Question Start]Question[Question End]
[Medium][Question Start]Question[Question End]
[Hard][Question Start]Question[Question End]
4. Your Questions of different difficulty levels should be distinct and actually reflect the different levels of difficulty.\\
\quad \\
Now it's your turn. Please provide the three Questions of different difficulty levels you created about the theme of {theme} for {domain}, according to the requirements.
"""
return prompt
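# Illustrative sketch (hypothetical helper, not referenced elsewhere in this
# module): one way to parse the [Easy]/[Medium]/[Hard] bracket format that
# CondorQuestionPrompt asks the model to emit.
def _parse_condor_questions_example(reply: str) -> list:
    """Return (difficulty, question) tuples parsed from a model reply."""
    import re
    pattern = r"\[(Easy|Medium|Hard)\]\[Question Start\](.*?)\[Question End\]"
    return [(level, question.strip()) for level, question in re.findall(pattern, reply, flags=re.S)]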
@PROMPT_REGISTRY.register()
class CondorCritiquePrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, question, answer):
dialogue = [question, answer]
base_critique_prompt = f"""
There is now a user’s question and a model’s response. You need to write a critique for this response, pointing out the
strengths and weaknesses of the model’s answer to help the model improve its response.
Your critique must strictly adhere to the following format:
[Critique Start]
[Strength Start]Strength[Strength End]
[Weakness Start]Weakness[Weakness End]
[Suggestion Start]Suggestion[Suggestion End]
[Critique End]
Here is the user’s question and the model’s response: {dialogue}
Now it’s your turn. Please provide your Critique as required:
"""
return base_critique_prompt
@PROMPT_REGISTRY.register()
class CondorRefinePrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, question, answer, critique):
base_refine_prompt = """
Now there is a user's question, a model's answer, and the user's feedback. Please help modify the model's answer based on the user's feedback to make it better.
Your improved answer must strictly adhere to the following format:
[Improved Answer Start]Your answer[Improved Answer End]
Below is the user's question, the model's answer, and the feedback:
[Question Start]{question}[Question End]
[Answer Start]{answer}[Answer End]
[Feedback Start]{critique}[Feedback End]
Now it's your turn, please provide your improved answer as required:
"""
return base_refine_prompt.format(question=question, answer=answer, critique=critique)
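# Illustrative sketch (hypothetical helper): extracting a tagged span such as
# [Strength Start]...[Strength End] or [Improved Answer Start]...[Improved Answer End]
# from replies to CondorCritiquePrompt / CondorRefinePrompt.
def _extract_tagged_span_example(reply: str, tag: str) -> str:
    """Return the text between [<tag> Start] and [<tag> End], or '' if absent."""
    import re
    match = re.search(rf"\[{re.escape(tag)} Start\](.*?)\[{re.escape(tag)} End\]", reply, flags=re.S)
    return match.group(1).strip() if match else ""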
@PROMPT_REGISTRY.register()
class LanguageFilterPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, text):
prompt='''You are a language identification expert. Your task is to identify the language of the given text input.
Follow these rules:
- Respond with the ISO 639-1 two-letter language code (e.g., "en", "fr", "zh", "ar").
- If the text contains multiple languages, identify the dominant one.
- If the language is not identifiable, respond with "Unknown".
- Do not translate or explain. Output only the language code.
Here are some examples:
Example 1:
Text: "Hello, how are you?"
Language: en
Example 2:
Text: "Je suis très heureux de vous rencontrer."
Language: fr
Example 3:
Text: "これは日本語の文です。"
Language: ja
Example 4:
Text: "¿Dónde está la estación de tren?"
Language: es
Example 5:
Text: "مرحبا، كيف حالك؟"
Language: ar
Example 6:
Text: "Guten Morgen! Wie geht's dir?"
Language: de
Example 7:
Text: "你好,我是一个程序员。"
Language: zh
Example 8:
Text: "Привет, как дела?"
Language: ru
Now, identify the language of the following text:
Text: "{text}"
Language:
'''
return prompt.format(text=text)
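# Illustrative sketch (hypothetical helper): normalizing the reply to
# LanguageFilterPrompt into a bare ISO 639-1 code for downstream filtering.
def _normalize_language_code_example(reply: str) -> str:
    """Lowercase and strip quotes/whitespace; return 'unknown' if not a 2-letter code."""
    code = reply.strip().strip('"\'').lower()
    return code if len(code) == 2 and code.isalpha() else "unknown"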
@PROMPT_REGISTRY.register()
class SFTFromScratchGeneratorPrompt(PromptABC):
"""
Prompt for generating brand-new SFT samples from scratch.
"""
def __init__(self):
pass
def build_prompt(self, domain_keys: str) -> str:
system_prompt = """You are a sophisticated data generation assistant specialized in creating high-quality Supervised Fine-Tuning (SFT) datasets for large language models.
Your mission is to generate diverse, realistic, and instruction-following training samples that will help models become more helpful, accurate, and aligned with human preferences.
## Core Principles:
**1. Structural Excellence:**
- instruction: Clear, specific, and actionable user request
- input: Contextual information when relevant (empty string if none needed)
- output: Comprehensive, accurate, and genuinely helpful response
- domain: Single domain classification from the provided taxonomy
**2. Quality Standards:**
- Responses must be factually accurate and demonstrate expertise
- Use natural, conversational language appropriate to the context
- Provide complete solutions that fully address the instruction
- Maintain consistency between instruction complexity and response depth
- Include relevant examples, explanations, or step-by-step guidance when beneficial
**3. Diversity Requirements:**
- Vary instruction phrasing and complexity levels
- Mix different user personas and contexts
- Include both simple and complex scenarios within each domain
- Generate instructions that reflect real-world use cases
**4. Safety & Ethics:**
- No harmful, illegal, discriminatory, or misleading content
- Respect privacy and avoid generating personal information
- Maintain neutrality on controversial topics
- Provide balanced perspectives when appropriate
**5. Technical Format:**
- Output valid JSON in a single line with no formatting
- Properly escape special characters in strings
- Ensure all required fields are present and correctly typed"""
user_prompt = f"""Generate ONE premium-quality SFT training sample as a single-line JSON object.
## Requirements:
- **instruction**: A realistic user request that varies in style, complexity, and specificity
- **input**: Additional context when it enhances the scenario (otherwise empty string)
- **output**: A comprehensive, expert-level response that fully satisfies the instruction
- **domain**: Select the most appropriate domain from: {domain_keys}
## Quality Checklist:
✓ Instruction is clear and represents authentic user needs
✓ Response demonstrates expertise and provides genuine value
✓ Appropriate level of detail for the complexity of the request
✓ Natural, human-like language throughout
✓ Perfect JSON formatting in a single line
## Diversity Goals:
- Mix formal/informal language styles
- Include various difficulty levels and user contexts
- Represent different cultural perspectives when relevant
- Balance theoretical knowledge with practical applications
## Format Example:
{{"instruction": "Create a Python function that calculates compound interest with error handling", "input": "", "output": "def compound_interest(principal, rate, time, n=1):\\n if principal <= 0 or rate < 0 or time < 0 or n <= 0:\\n raise ValueError('Invalid input: principal must be positive, rate and time non-negative, n positive')\\n return principal * (1 + rate/n)**(n*time)\\n\\n# Example usage:\\n# result = compound_interest(1000, 0.05, 2, 4)", "domain": "coding"}}
Output only the JSON - no explanations or additional text."""
return system_prompt + "\n\n" + user_prompt
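# Illustrative sketch (hypothetical helper): validating the single-line JSON
# sample requested by SFTFromScratchGeneratorPrompt.
def _parse_sft_sample_example(reply: str) -> dict:
    """Parse the reply as JSON and check that the four required fields are present."""
    import json
    sample = json.loads(reply.strip())
    missing = {"instruction", "input", "output", "domain"} - sample.keys()
    if missing:
        raise ValueError(f"SFT sample missing fields: {sorted(missing)}")
    return sample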
from dataflow.utils.registry import PROMPT_REGISTRY
from dataflow.core.prompt import PromptABC
import re
@PROMPT_REGISTRY.register()
class KnowledgeCleanerPrompt(PromptABC):
'''
    Knowledge-cleaning prompt builder with Chinese/English multilingual support.
Specialized in refining raw content with multilingual support.
'''
def __init__(self, lang: str = "en", strict_mode: bool = True):
self.lang = lang
self.strict_mode = strict_mode
def build_prompt(self, raw_content: str) -> str:
"""生成知识清洗的思维链提示词(保持原有格式)"""
if self.lang == "en":
self.prompt_header = f"""
You are a meticulous Knowledge Refinement Engineer. Apply these rules STRICTLY:
1. Remove redundant tags but retain:
- Semantic tags like <table>, <code>
- Meaningful attributes
2. Normalize special characters:
- Standardize quotes and dashes
- Convert ellipsis (...)
3. URL handling:
- Preserve footnote URLs
- Extract display texts
4. Text structure:
- Maintain paragraph/list breaks
- Keep code indentation
- Limit empty lines (max=2)
5. Reference processing (NEW):
- Images → "[Image: alt_text]"
- Signatures → "[Signature]"
6. Code blocks: {"(strict)" if self.strict_mode else ""}
- {"Force closure" if self.strict_mode else "Preserve raw"}
- Mark fragments as /*...*/
7. Absolute fidelity:
- NO fact/number modifications
- NO term paraphrasing
- NO table structure changes
8. Security Processing (NEW):
- PII: Phone/ID/Email must be masked, e.g.
Original: phone 13800138000 → Processed: phone 138****8000
- Classified: Mark 【Confidential】as 〖SEC∶classified〗
- Illegal: Replace sensitive content with 〖ILLEGAL∶removed〗
- Encryption tags: Use 〖〗for encrypted sections
Example:
Input:
<div class="article">
<h1>Knowledge Cleaning™</h1>
<figure>
<img src="process.png" alt="Cleaning Flowchart" title="Three Phases">
<figcaption>Fig.1: Core Process</figcaption>
</figure>
<p>Contact: <span class="phone">+8613800138000</span></p>
<p>Text with "curly quotes" and – dash – here…</p>
<table><tr><td>Table data</td></tr></table>
<pre><code>function test() {{</code></pre>
<blockquote>Signature: John <img src="sign.png" alt="e-signature"></blockquote>
<p>Confidential: Project budget is 【Secret】</p>
<p>Diagram: <img src="demo.jpg" class="diagram"></p>
</div>
Output:
<cleaned_start>
Knowledge Cleaning™
[Image: Cleaning Flowchart (Three Phases) Fig.1: Core Process]
Contact: +86*****8000
Text with "straight quotes" and - dash - here...
<table><tr><td>Table data</td></tr></table>
<code>function test() {{ /*...*/ }}</code>
[Signature]Signature: John [Image: e-signature]
〖SEC∶classified content〗
Diagram: [Image: Diagram demo.jpg]
<cleaned_end>
"""
else:
self.prompt_header =f"""
你是一名严谨的知识清洗工程师。请严格按照以下规则处理原始内容:
1. 移除冗余HTML/XML标签,但保留:
- 语义化标签如 <table>、<code>、<formula>
- 所有携带意义的属性值
2. 规范化特殊字符:
- 将花引号(“ ” ‘ ’)转为标准引号(" ")
- 将长破折号(– —)替换为短横线(-)
- 中文省略号(…)转为英文省略号(...)
- 保留数学符号和技术记号(如<<、>>等操作符)
3. 链接处理:
- 脚注/参考文献中的URL保持原样
- 移除超链接包装但保留显示文本
示例:<a href="https://example.com">示例</a> → 示例
4. 文本结构:
- 保持原始段落/列表的换行
- 保留代码/引用的缩进层级
- 压缩连续空行为最多2行
5. 引用内容处理(新增):
- 图片引用转换为【引用图片:描述文本】
- 签名区块标记为【签名引用】
6. 代码块处理:{"(严格模式)" if self.strict_mode else ""}
- {"确保代码块闭合(如补全缺失的括号)" if self.strict_mode else "保持代码原样"}
- 标记不完整代码为/*...*/
7. 绝对保真:
- 禁止增删任何事实、数字或命名实体
- 禁止改写专业术语或专有名词
- 禁止修改表格数据结构
8. 安全处理(新增):
- 个人隐私:身份证号/手机号/邮箱等需脱敏,示例:
原文本:电话 13800138000 → 处理后:电话 138****8000
- 涉密内容:检测到【机密】【秘密】等关键词时,整句替换为【涉密内容已加密】
- 违规信息:政治敏感、暴恐等内容替换为【违规内容已屏蔽】
- 加密标记:使用〖〗包裹加密区域,示例:
〖PII∶身份证号〗〖SEC∶机密字段〗
示例:
输入:
<div class="article">
<h1>知识清洗®</h1>
<figure>
<img src="process.png" alt="清洗流程图" title="三阶段处理">
<figcaption>图1:核心流程</figcaption>
</figure>
<p>联系电话:<span class="phone">13800138000</span></p>
<p>这是包含"花引号"和—破折号—的文本…</p>
<table><tr><td>表格数据</td></tr></table>
<pre><code>function test() {{</code></pre>
<blockquote>签名:张三 <img src="sign.png" alt="电子签名"></blockquote>
<p>机密信息:本项目预算为【机密】</p>
<p>示意图:<img src="demo.jpg" class="diagram"></p>
</div>
输出:
<cleaned_start>
知识清洗®
[引用图片:清洗流程图(三阶段处理)图1:核心流程]
联系电话:138****8000
这是包含"花引号"和-破折号-的文本...
<table><tr><td>表格数据</td></tr></table>
<code>function test() {{ /*...*/ }}</code>
[签名引用]签名:张三 [引用图片:电子签名]
涉密内容已加密
示意图:[引用图片:示意图demo.jpg]
<cleaned_end>
"""
if self.lang == "en":
processing_steps = """
Processing Steps:
1. [Tag Analysis] Classify markup tags
2. [Reference Extraction] Isolate images/tables
3. [Character Audit] Log special chars
4. [Structure Check] Validate hierarchy
5. [Final Output] Generate cleaned text
""".strip()
output_requirement = 'Response must contain ONLY cleaned text between <cleaned_start> and <cleaned_end>.'
else:
processing_steps = """
处理步骤:
1. [标签分析] 识别并分类所有标记标签
2. [引用提取] 分离图片/表格/签名等引用内容
3. [字符审核] 记录特殊字符变更
4. [结构检查] 验证文本层级
5. [最终输出] 生成清洗后文本
""".strip()
output_requirement = '响应必须只包含清洗后文本,以<cleaned_start>开头,<cleaned_end>结尾,无其他内容。'
return f"""
{self.prompt_header}
待清洗内容:
{raw_content}
{processing_steps}
{output_requirement}
""".strip()
def _post_process(self, cleaned_text: str) -> str:
"""后处理逻辑(新增引用校验)"""
if self.strict_mode:
# 校验引用标记完整性
cleaned_text = re.sub(r'(!$$.*?$$)$.+?$',
lambda m: f"【引用图片:{m.group(1)[2:-1]}" if "图片" in m.group(1) else m.group(0),
cleaned_text)
return cleaned_text
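# Illustrative usage (comment-only): with strict_mode=True, _post_process rewrites
# Markdown-style image references into the 【引用图片:…】 form, e.g.
#   _post_process('![流程图片](flow.png)')  ->  '【引用图片:流程图片】'
# (the file name here is hypothetical).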
@PROMPT_REGISTRY.register()
class MathbookQuestionExtractPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self):
PROMPT = """You are given a collection of images:
• page_n.jpg – the n-th page of a math textbook
• page_n+1.jpg – the (n+1)-th page of the same book
• index.jpg files (e.g. 1.jpg, 2.jpg, …) – all figures, diagrams or illustrations appearing on those two pages
Your task:
1. Extract every exercise (math problem) that has at least one line or element on page_n.jpg. You should extract the problem in its original language, do not translate it.
2. If a problem is split across page_n.jpg and page_n+1.jpg, include it in full (using page_n+1.jpg only to complete it).
3. Do not extract any problem that appears exclusively on page_n+1.jpg.
4. For each extracted problem, locate any referenced figures among the index.jpg files and insert the exact filename in <image>...</image> (for example <image>3.jpg</image>) at the correct place in the problem text.
5. Return all extracted problems concatenated into one string, using the literal token <SPACE> to separate them. For example:
PROBLEM_TEXT_1<SPACE>PROBLEM_TEXT_2<SPACE>PROBLEM_TEXT_3
6. If no qualifying problems are found on page_n.jpg, return two consecutive spaces: "<SPACE><SPACE>".
Ensure that figure tags exactly match the provided image filenames and that no extra separators or punctuation are added.
"""
return PROMPT
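# Illustrative sketch (hypothetical helper): splitting the concatenated reply that
# MathbookQuestionExtractPrompt requests, where problems are joined by the literal
# token <SPACE> and "<SPACE><SPACE>" signals that no problems were found.
def _split_mathbook_problems_example(reply: str) -> list:
    """Return the non-empty problem strings from a <SPACE>-separated reply."""
    return [problem.strip() for problem in reply.split("<SPACE>") if problem.strip()]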
from dataflow.utils.registry import PROMPT_REGISTRY
from dataflow.core.prompt import PromptABC
'''
A collection of prompts for model evaluation.
'''
@PROMPT_REGISTRY.register()
class AnswerJudgePrompt(PromptABC):
"""
    Prompt template for building answer-judging prompts.
"""
def __init__(self):
pass
def build_prompt(self, answer, reference_answer, question=None):
prompt = f"""
As an answer evaluation expert, please assess whether the following answer is correct.
Reference Answer: {reference_answer}
Current Answer: {answer}
Please carefully analyze whether the current answer is semantically consistent with the reference answer.
Focus only on comparing the answers themselves, not on how the problem is solved.
Don't just look at the surface text, understand the essential content of the answers.
If the current answer is semantically consistent with the reference answer, even if expressed differently, it should be judged as correct.
Please return your judgment result in JSON format:
{{"judgement_result": true}} indicates the answer is correct
{{"judgement_result": false}} indicates the answer is incorrect
Your judgment:
"""
return prompt
@PROMPT_REGISTRY.register()
class AnswerJudgePromptQuestion(PromptABC):
"""
    Prompt template for building answer-judging prompts (with the original question included).
"""
def __init__(self):
pass
def build_prompt(self, question, answer, reference_answer):
prompt = f"""
As an answer evaluation expert, please assess whether the following answer is correct.
Question: {question}
Reference Answer: {reference_answer}
Current Answer: {answer}
Please carefully analyze whether the current answer is semantically consistent with the reference answer.
Focus only on comparing the answers themselves, not on how the problem is solved.
Don't just look at the surface text, understand the essential content of the answers.
If the current answer is semantically consistent with the reference answer, even if expressed differently, it should be judged as correct.
Please return your judgment result in JSON format:
{{"judgement_result": true}} indicates the answer is correct
{{"judgement_result": false}} indicates the answer is incorrect
Your judgment:
"""
return prompt
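# Illustrative sketch (hypothetical helper): reading the boolean verdict that the
# two answer-judge prompts above ask the model to return as JSON.
def _parse_judgement_example(reply: str) -> bool:
    """Extract {"judgement_result": ...} from the reply; default to False."""
    import json, re
    match = re.search(r"\{.*\}", reply, flags=re.S)
    if not match:
        return False
    try:
        return bool(json.loads(match.group(0)).get("judgement_result", False))
    except json.JSONDecodeError:
        return False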
@PROMPT_REGISTRY.register()
class AnswerJudgeMultipleQuestionsPrompt(PromptABC):
"""
    Prompt template for building answer-judging prompts, supporting judgments over multiple sub-questions.
"""
def __init__(self):
pass
def build_prompt(self, answer, reference_answer, question=None):
prompt = f"""
As an answer evaluation expert, please assess whether the following answer is correct.
Question: {question}
Reference Answer: {reference_answer}
Current Answer: {answer}
Please carefully analyze whether the current answer is semantically consistent with the reference answer.
Focus only on comparing the answers themselves, not on how the problem is solved.
Don't just look at the surface text, understand the essential content of the answers.
If the current answer is semantically consistent with the reference answer, even if expressed differently, it should be judged as correct.
The question may contain multiple sub-questions (e.g., ①②③ or (a)(b), etc.).
You should first identify the sub-questions in the question, then evaluate the correctness of each corresponding part in the current answer.
You need to provide your reason for each sub-question's judgment.
Your judgement should be a JSON array, where each element is "true" or "false" (use string instead of boolean), indicating whether the answer to each sub-question is correct.
If there is only one question, also return a single-element array.
If the reference answer is incomplete and you are unable to judge some sub-questions, mark the corresponding sub-questions as "empty".
Example:
Question: ① 1+2=? ② What is 2+2? ③ What is 3+3?
Reference Answer: ① 3 ③ 6
Current Answer: ① Three ② Four ③ Seven
Output: {{"reason": "The answer to sub-question 1 is correct as 'Three' is semantically consistent with '3'. The reference answer does not provide information for sub-question 2, so it is marked as 'empty'. The answer to sub-question 3 is incorrect as 'Seven' is not semantically consistent with '6'.", "judgement": ["true", "empty", "false"]}}
Your judgment:
"""
return prompt
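# Illustrative sketch (hypothetical helper): reading the per-sub-question verdicts
# ("true" / "false" / "empty") requested by AnswerJudgeMultipleQuestionsPrompt.
def _parse_multi_judgement_example(reply: str) -> list:
    """Return the "judgement" list from the reply's JSON object, or [] on failure."""
    import json, re
    match = re.search(r"\{.*\}", reply, flags=re.S)
    if not match:
        return []
    try:
        return [str(v).lower() for v in json.loads(match.group(0)).get("judgement", [])]
    except json.JSONDecodeError:
        return []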
from dataflow.utils.registry import PROMPT_REGISTRY
from dataflow.core.prompt import PromptABC
@PROMPT_REGISTRY.register()
class VQAExtractPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, example_title, subject: str = "math", interleaved=True) -> str:
PROMPT = ""
if interleaved:
PROMPT = f"""
You are an expert in {subject} competition. You are given an image—page_n—annotated with detected bounding boxes and corresponding labels. Your task is to extract from page_n only:
1. All {subject} problems whose text begins on page_n and the answers with solutions to those problems.
2. If the problem or answer is not complete (because they continue onto page_n+1), omit them. If the problem is complete but the solution is not, omit both the problem and the solution. DO NOT INCLUDE INCOMPLETE PROBLEMS OR ANSWERS. However, if only the solution is incomplete, you may still include the question and the short answer if they are complete and leave the solution empty.
3. Normally, a box at the beginning of a page with no problem number (such as "1.1", "例 1", "example 1", "解", "solution", "答案", "answer") is the continuation of the problem or solution from the previous page, even if it appears to be an independent paragraph. Omit them.
4. The chapter information as it appears on page_n. YOU MUST INCLUDE ALL TITLES APPEARING ON THE PAGE, EVEN IF NO QUESTIONS OR ANSWERS ARE PRESENT UNDER THAT TITLE.
"""
else:
PROMPT = f"""
You are an expert in {subject} competition. You are given an image—page_n—annotated with detected bounding boxes and corresponding labels. Your task is to extract from page_n only:
1. All {subject} problems whose text appears on page_n. The provided page will contain either only questions or only answers with solutions, never a mix of the two.
2. If the problem or answer is not complete (because they continue onto page_n+1), omit them. DO NOT INCLUDE INCOMPLETE QUESTIONS OR ANSWERS. However, if only the solution is incomplete, you may still include the question and the short answer (sometimes only a letter or number) if they are complete and leave the solution empty.
3. Normally, a box at the beginning of a page with no problem number (such as "1.1", "例 1", "example 1", "解", "solution", "答案", "answer") is the continuation of the problem or solution from the previous page, even if it appears to be an independent paragraph. Omit them.
4. The chapter information as it appears on page_n. YOU MUST INCLUDE ALL TITLES APPEARING ON THE PAGE, EVEN IF NO QUESTIONS OR ANSWERS ARE PRESENT UNDER THAT TITLE.
"""
PROMPT +=f"""
When provided two column pages, you should read from **left to right**, top to bottom. Also output the extracted content from **left to right**, top to bottom.
Strict extraction rules:
** About questions and answers/solutions **
- If you think the page is not the main text page, such as a cover page, catalog page, header/footer only, etc., output `<empty></empty>`.
- Preserve each problem’s original label/number, such as "例1", "Example 3", "习题1", "11". Do not include the period after the number. Use Arabic numerals only. For example, if the label is "例一", convert it to "例1". If the label is "IV", convert it to "4". If the full label is "三、16", keep only "16".
- If there are multiple sub-questions under one main question, always put them together in the same `<qa_pair>`…`</qa_pair>` block.
- If a question and its answer/solution are contiguous on page_n, wrap them together as a single `<qa_pair>`…`</qa_pair>` block, e.g.:
`<qa_pair><label>例1</label><question>…</question><answer>…</answer><solution>…</solution></qa_pair>`
- If only questions or only answers with solutions appear on page_n, wrap each question or answer with solution in a `<qa_pair>`…`</qa_pair>` block with the missing part left empty. For example, if only questions appear:
`<qa_pair><label>例1</label><question>…</question><answer></answer><solution></solution></qa_pair>`
- If multiple questions and solutions appear on page_n, wrap each question/solution pair in its own `<qa_pair>`…`</qa_pair>` block.
- Sometimes a short answer may appear before the full solution. If you do not see the full solution on page_n, only extract the short answer and leave the solution empty.
** About chapter/section titles **
- Enclose the output in a `<chapter>`…`</chapter>` block, where <title>MAIN_TITLE</title> is the chapter title or section title appearing on page_n.
- There could be multiple `<chapter>`…`</chapter>` blocks if multiple chapters/sections appear on page_n.
- Extract chapter titles only, and with no prefix number. For example, "{example_title}", please strictly follow this example title. If you see multiple titles piled together, use the one at the bottom only.
- **Do not keep subtitles. Any titles followed by a question/answer whose label is not 1 should be considered a subtitle. DO NOT EXTRACT THEM.**
- If you encounter a title with no problems or answers on the page, still extract the title within the `<chapter>`…`</chapter>` block, with an empty qa_pair block with label 0 `<qa_pair><label>0</label><question></question><answer></answer><solution></solution></qa_pair>`. Normally this will happen when the title is at the end of the page.
- Do not use nested titles.
- Leave the title blank if there is no chapter title of the questions.
- Sometimes the chapter title may not appear at the beginning of the page. You should leave the title of all the qa pairs before the chapter title **blank**, but the title of the qa pairs after the chapter title should use this chapter title.
** About text and figures/diagrams **
- For problem and answer/solution text, output exactly what appears (no translation). Render all mathematical expressions in LaTeX.
- Whenever the question or answer/solution refers to a figure or diagram, record it with `<pic>tagA:boxB</pic>`, such as `<pic>tag5:box7</pic>`. tagA:boxB is labeled (in exactly the same format) in the image beside the figure or diagram, in RED color. Be careful: the book's original caption may also be present, usually in the format A.B (normally in black). Do NOT use the original caption from the book!!! Additionally, the figure/diagram may be surrounded by multiple labels (some belonging to other boxes), so be careful to pick the correct one; the correct label will be at the upper left of the figure/diagram. If you are not sure, you are free to put multiple labels, e.g. `<pic>tag5:box7</pic> <pic>tag5:box8</pic>`. NEVER leave it blank or make up a label!
- You should always put the `<pic>...</pic>` tag at the exact position where the figure/diagram is referenced in the text. If there are multiple references, put multiple tags at the correct positions.
If no qualifying content is found, output:
<empty></empty>
Output format (all tags run together, no extra whitespace or newlines except between entries):
<chapter><title>MAIN_TITLE</title>
<qa_pair><label>…</label><question>QUESTION_TEXT<pic>…</pic>…</question>
<answer>ANSWER_TEXT<pic>…</pic>…</answer><solution>SOLUTION_TEXT</solution></qa_pair>
<qa_pair><label>…</label><question>QUESTION_TEXT<pic>…</pic>…</question>
<answer>ANSWER_TEXT<pic>…</pic>…</answer><solution></solution></qa_pair>
</chapter>
<chapter><title>MAIN_TITLE</title>
<qa_pair><label>…</label><question>QUESTION_TEXT<pic>…</pic>…</question>
<answer>ANSWER_TEXT<pic>…</pic>…</answer><solution>SOLUTION_TEXT</solution></qa_pair>
</chapter>
Example:
<chapter><title>Chapter 2</title>
<qa_pair><label>例1</label><question>Calculate \(x\) such that \(x^2-1=0\).<pic>tag5:box7</pic></question>
<answer>\(x=\pm1\).</answer><solution>SOLUTION_TEXT</solution></qa_pair>
<qa_pair><label>例2</label><question>Calculate \(x\) such that \(x^2-4=0\).<pic>tag5:box8</pic></question>
<answer>\(x=\pm2\).</answer><solution></solution></qa_pair>
</chapter>
<chapter><title>Chapter 3</title>
<qa_pair><label>例1</label><question>Calculate \(x\) such that \(x^3-1=0\).<pic>tag6:box7</pic></question>
<answer>\(x=1\).</answer><solution>SOLUTION_TEXT</solution></qa_pair>
</chapter>
<chapter><title>Chapter 4</title>
<qa_pair><label>0</label><question></question><answer></answer><solution></solution></qa_pair>
</chapter>
Please now process the provided page_n image and output your result.
"""
return PROMPT
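# --- Illustrative sketch (not part of this pipeline): one possible way a caller could
# parse the <chapter>/<qa_pair> output format described in the prompt above. The function
# name and the regex-based approach are assumptions for demonstration only.
import re

def _sketch_parse_qa_pairs(model_output: str) -> list[dict]:
    """Split a model response into qa_pair dicts with label/question/answer/solution."""
    pairs = []
    for block in re.findall(r"<qa_pair>(.*?)</qa_pair>", model_output, re.S):
        fields = {}
        for tag in ("label", "question", "answer", "solution"):
            m = re.search(rf"<{tag}>(.*?)</{tag}>", block, re.S)
            fields[tag] = m.group(1).strip() if m else ""
        pairs.append(fields)
    return pairs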
@PROMPT_REGISTRY.register()
class QAExtractPrompt(PromptABC):
def __init__(self):
pass
def build_prompt(self, subject: str = "math") -> str:
PROMPT = f"""
You are an expert in {subject}. You are given a json file. Your task is to segment the content, insert image tags, and extract labels:
1. Every json item has an "id" field. Your main task is to output this field.
2. You need to segment the content into multiple `<qa_pair>`…`</qa_pair>` blocks, each containing a question and its corresponding answer with solution.
3. If the problem or answer/solution is not complete, omit them. An answer/solution should be considered complete as long as either the answer or solution exists.
4. You need to put the image ids into the proper positions. You can look at the caption or context to decide where to put the image tags.
5. You will also need to extract the chapter title and each problem's label/number from the text.
6. You only need to output the "id" field for **chapter titles, questions and solutions**. DO NOT OUTPUT ORIGINAL TEXT. Use ',' to separate different ids.
7. However, use the original labels/numbers for labels and the original answer text for answers. DO NOT output the "id" field for labels and answers; you will need to extract them from the text.
"""
PROMPT += f"""
Strict extraction rules:
** About questions and answers/solutions **
- Preserve each problem’s original label/number, such as "例1", "Example 3", "习题1", "11". Do not include the period after the number. Use Arabic numerals only. For example, if the label is "例一", convert it to "例1". If the label is "IV", convert it to "4".
- If the full label is "三、16", keep only "16". If the full label is "5.4", keep only "4".
- If there are multiple sub-questions (such as "(1)", "(a)") under one main question, always put them together in the same `<qa_pair>`…`</qa_pair>` block.
- If a question and its answer/solution are contiguous, wrap them together as a single `<qa_pair>`…`</qa_pair>` block, e.g.:
`<qa_pair><label>1</label><question>…</question><answer>…</answer><solution>…</solution></qa_pair>`
- If only questions or only answers/solutions appear, wrap each question or answer/solution in a `<qa_pair>`…`</qa_pair>` block with the missing part left empty. For example, if only questions appear:
`<qa_pair><label>1</label><question>…</question><answer></answer><solution></solution></qa_pair>`
- In total, there are 7 possibilities: only question, only answer, only solution, question with answer, question with solution, answer with solution, full question and answer and solution.
- If multiple qa pairs appear, wrap each qa pair in its own `<qa_pair>`…`</qa_pair>` block.
- If you do not see the full solution, only extract the short answer and leave the solution empty. YOU MUST KEEP SHORT ANSWERS !!!
** About chapter/section titles **
- Always enclose qa pairs in a `<chapter>`…`</chapter>` block, where <title>MAIN_TITLE_ID</title> is the id of the chapter title or section title.
- Normally, chapter/section titles appear before the questions/answers in an independent json item.
- There could be multiple `<chapter>`…`</chapter>` blocks if multiple chapters/sections exist.
- **Any title followed by a question/answer whose label/number is not 1, or title with a score, should NOT be extracted.**
- Do not use nested titles.
- Leave the title blank if there is no chapter title.
** About figures/diagrams **
- Whenever the question or answer/solution refers to a figure or diagram, record its "id" in question/answer/solution just like other text content.
- You MUST include all images referenced in the question/answer/solution.
If no qualifying content is found, output:
<empty></empty>
Output format (all tags run together, no extra whitespace or newlines except between entries):
<chapter><title>MAIN_TITLE_ID</title>
<qa_pair><label>LABEL(EXTRACTED FROM TEXT)</label><question>QUESTION_IDS</question>
<answer>ANSWER(EXTRACTED FROM SOLUTION)</answer><solution>SOLUTION_IDS</solution></qa_pair>
<qa_pair><label>LABEL(EXTRACTED FROM TEXT)</label><question>QUESTION_IDS</question>
<answer>ANSWER(EXTRACTED FROM SOLUTION)</answer><solution></solution></qa_pair>
</chapter>
<chapter><title>MAIN_TITLE_ID</title>
<qa_pair><label>LABEL(EXTRACTED FROM TEXT)</label><question>QUESTION_IDS</question>
<answer>ANSWER(EXTRACTED FROM SOLUTION)</answer><solution>SOLUTION_IDS</solution></qa_pair>
</chapter>
Example:
<chapter><title>7</title>
<qa_pair><label>1</label><question>2,3</question>
<answer>Yes</answer><solution>5,6,7</solution></qa_pair>
<qa_pair><label>2</label><question>8,9,10</question>
<answer>3.14</answer><solution></solution></qa_pair>
</chapter>
<chapter><title>12</title>
<qa_pair><label>1</label><question></question>
<answer>2^6</answer><solution>16</solution></qa_pair>
</chapter>
Please now process the provided json and output your result.
"""
return PROMPT
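# --- Illustrative sketch (not part of this pipeline): the prompt above asks the model to
# emit comma-separated item ids (e.g. <question>2,3</question>). A caller could map those
# ids back to the source json items roughly as below; the "content" field name is an
# assumption for illustration, since the item schema is not defined in this file.
def _sketch_resolve_ids(id_field: str, items_by_id: dict) -> str:
    """Join the text of the json items whose ids appear in a comma-separated id field."""
    parts = []
    for raw_id in id_field.split(","):
        raw_id = raw_id.strip()
        if raw_id and raw_id in items_by_id:
            parts.append(str(items_by_id[raw_id].get("content", "")))
    return "\n".join(parts)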
from .diy import *
from .general import *
from .math import *
from dataflow import get_logger
from dataflow.utils.registry import PROMPT_REGISTRY
from dataflow.core.prompt import PromptABC
'''
A collection of prompts for the diy reasoning operator.
'''
@PROMPT_REGISTRY.register()
class DiyAnswerGeneratorPrompt(PromptABC):
def __init__(self, prompt_template):
self.prompt_template = prompt_template
self.logger = get_logger()
def build_prompt(self, question: str) -> str:
    try:
        # The user-supplied template is prepended as-is; the question and a fixed
        # instruction suffix are appended to it.
        return self.prompt_template + question + r'''Your response must start directly with "Solution:" without any preamble. Finish your response immediately after the solution.'''
    except Exception:
        self.logger.debug("Please check that prompt_template and question are both strings that can be concatenated.")
@PROMPT_REGISTRY.register()
class DiyQuestionFilterPrompt(PromptABC):
def __init__(self, prompt_template):
self.prompt_template = prompt_template
self.logger = get_logger()
def build_prompt(self, question: str) -> str:
    try:
        # The user-supplied template is expected to contain a {question} placeholder.
        return self.prompt_template.format(question=question)
    except Exception:
        self.logger.debug("Failed to format prompt_template; please check that it uses only the {question} placeholder.")
@PROMPT_REGISTRY.register()
class DiyQuestionSynthesisPrompt(PromptABC):
def __init__(self, prompt_template):
self.prompt_template = prompt_template
self.logger = get_logger()
def build_prompt(self, question: str) -> str:
    try:
        # The user-supplied template is expected to contain a {question} placeholder.
        return self.prompt_template.format(question=question)
    except Exception:
        self.logger.debug("Failed to format prompt_template; please check that it uses only the {question} placeholder.")
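# --- Hedged usage sketch: the format-based prompts above expect the user-supplied
# template to contain a literal {question} placeholder. The template text below is
# illustrative only and not taken from the pipeline.
if __name__ == "__main__":
    _template = "Rewrite the following problem so it is harder, keeping the answer unchanged:\n{question}\n"
    print(DiyQuestionSynthesisPrompt(_template).build_prompt("Solve x^2 - 4 = 0."))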