".github/scripts/run-clang-format.py" did not exist on "b81d189d82789a3c53a4d9f04afad486a5df56c3"
Commit 97e8278b authored by zzg_666

Adapt to the vLLM backend

from dataflow.prompts.general_text import Phi4QAGeneratorPrompt
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict
@prompt_restrict(
Phi4QAGeneratorPrompt
)
@OPERATOR_REGISTRY.register()
class Phi4QAGenerator(OperatorABC):
    '''
    Phi4QAGenerator converts raw document content into pre-training style multi-turn dialogue Q&A data using an LLM serving backend.
    '''
def __init__(self, llm_serving: LLMServingABC):
self.logger = get_logger()
self.prompts = Phi4QAGeneratorPrompt()
self.llm_serving = llm_serving
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于给定文档内容,生成预训练格式的多轮对话问答数据。将原始文档内容转换为适合语言模型预训练的对话格式数据。"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- input_key:输入文档内容字段名,默认为'raw_content'\n"
"- output_key:输出生成内容字段名,默认为'generated_content'\n"
"输出参数:\n"
"- 包含原始内容和生成内容的DataFrame\n"
"- 返回输出字段名,用于后续算子引用"
)
elif lang == "en":
return (
"Generate pre-training format multi-turn dialogue Q&A data based on the given document content. \n"
"Converts raw document content into dialogue format data suitable for language model pre-training.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- input_key: Field name for input document content, default is 'raw_content'\n"
"- output_key: Field name for output generated content, default is 'generated_content'\n\n"
"Output Parameters:\n"
"- DataFrame containing original and generated content\n"
"- Returns output field name for subsequent operator reference"
)
else:
return (
"PretrainGenerator converts document content into pre-training format multi-turn dialogue data."
)
def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "generated_content"):
self.input_key, self.output_key = input_key, output_key
self.logger.info("Running PretrainGenerator...")
# Load the raw dataframe from the input file
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
        # Prepare LLM inputs by formatting the prompt with raw content from the dataframe;
        # keep track of which rows actually produced a prompt so outputs can be aligned later
        llm_inputs = []
        valid_indices = []
        for index, row in dataframe.iterrows():
            raw_content = row.get(self.input_key, '')
            if raw_content:
                llm_input = self.prompts.build_prompt(raw_content)
                llm_inputs.append(llm_input)
                valid_indices.append(index)
# Generate the text using the model
try:
self.logger.info("Generating text using the model...")
generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
self.logger.info("Text generation completed.")
except Exception as e:
self.logger.error(f"Error during text generation: {e}")
return
        # Add the generated content back to the dataframe, aligned to the rows that were sent to the LLM
        dataframe[self.output_key] = None
        dataframe.loc[valid_indices, self.output_key] = generated_outputs
# Save the updated dataframe to the output file
output_file = storage.write(dataframe)
return output_key
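
# A minimal usage sketch (not part of the operator). The serving and storage objects
# below are hypothetical stand-ins: in practice you would pass a real LLMServingABC
# implementation (e.g. a vLLM- or API-backed backend) and a configured DataFlowStorage.
if __name__ == "__main__":

    class _ToyServing:
        """Duck-typed stand-in for LLMServingABC: returns one canned reply per prompt."""
        def generate_from_input(self, user_inputs, system_prompt=""):
            return [f"<dialogue generated for prompt #{i}>" for i, _ in enumerate(user_inputs)]

    class _ToyStorage:
        """Duck-typed stand-in for DataFlowStorage backed by an in-memory DataFrame."""
        def __init__(self, df):
            self._df = df
        def read(self, _fmt):
            return self._df
        def write(self, df):
            self._df = df
            return "in_memory"

    df = pd.DataFrame({"raw_content": ["Some source document text.", "Another document."]})
    op = Phi4QAGenerator(llm_serving=_ToyServing())
    op.run(_ToyStorage(df), input_key="raw_content", output_key="generated_content")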
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from .eval.alpagasus_sample_evaluator import AlpagasusSampleEvaluator
from .eval.deita_quality_sample_evaluator import DeitaQualitySampleEvaluator
from .eval.deita_complexity_sample_evaluator import DeitaComplexitySampleEvaluator
from .eval.instag_sample_evaluator import InstagSampleEvaluator
from .eval.rm_sample_evaluator import RMSampleEvaluator
from .eval.superfiltering_sample_evaluator import SuperfilteringSampleEvaluator
from .eval.treeinstruct_sample_evaluator import TreeinstructSampleEvaluator
from .filter.alpagasus_filter import AlpagasusFilter
from .filter.deita_quality_filter import DeitaQualityFilter
from .filter.deita_complexity_filter import DeitaComplexityFilter
from .filter.instag_filter import InstagFilter
from .filter.rm_filter import RMFilter
from .filter.superfiltering_filter import SuperfilteringFilter
from .filter.treeinstruct_filter import TreeinstructFilter
from .generate.condor_generator import CondorGenerator
from .generate.sft_generator_from_seed import SFTGeneratorSeed
from .refine.condor_refiner import CondorRefiner
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/text_sft/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
    sys.modules[__name__] = LazyLoader(__name__, cur_path, _import_structure)
import os
import json
import torch
import argparse
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
PROMPT_DICT_NONE = {
"prompt_input": (
"{instruction}\n{input}\n"
),
"prompt_no_input": (
"{instruction}\n"
),
}
# Compute the perplexity (and loss) of the whole input text
def get_perplexity_and_embedding_whole_text(tokenizer, model, text, max_length, device):
input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
with torch.no_grad():
outputs = model(input_ids, labels=input_ids.contiguous())
loss = outputs.loss
perplexity = torch.exp(loss)
return perplexity.to('cpu').item(), loss.to('cpu').item()
# Compute the perplexity (and loss) of the target span only (instruction-conditioned version, token-wise loss masked outside the span)
def get_perplexity_and_embedding_part_text(tokenizer, model, text, target_span, max_length, device):
try:
input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
start_index = text.rfind(target_span)
start_token = len(tokenizer.encode(text[:start_index]))
end_token = input_ids.shape[1]
labels = input_ids.clone()
labels[0, :start_token] = -100
with torch.no_grad():
outputs = model(input_ids, labels=labels)
loss = outputs.loss
perplexity = torch.exp(loss)
return perplexity.to('cpu').item(), loss.to('cpu').item()
    except Exception:
        # fall back to neutral values if tokenization or span lookup fails
        return 0, 0
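
# A small sketch of how these two helpers are typically combined into an IFD-style
# score (the ratio used by SuperfilteringSampleEvaluator further below): the perplexity
# of the response conditioned on the prompt divided by the perplexity of the response
# alone. The model and texts here are illustrative assumptions.
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
    prompt = "Summarize the following sentence.\nThe cat sat on the mat.\n"
    response = "A cat is sitting on a mat."
    ppl_alone, _ = get_perplexity_and_embedding_whole_text(tokenizer, model, response, 512, device)
    ppl_cond, _ = get_perplexity_and_embedding_part_text(tokenizer, model, prompt + response, response, 512, device)
    ifd = ppl_cond / ppl_alone if ppl_alone else 0.0  # < 1 means the prompt makes the response easier to predict
    print(f"ppl(response)={ppl_alone:.2f}, ppl(response|prompt)={ppl_cond:.2f}, IFD={ifd:.2f}")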
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.core.prompt import prompt_restrict
from dataflow.prompts.general_text import AlpagasusPrompt
@prompt_restrict(
AlpagasusPrompt
)
@OPERATOR_REGISTRY.register()
class AlpagasusSampleEvaluator(OperatorABC):
def __init__(self, llm_serving: LLMServingABC = None, dimension: str = 'quality'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.llm_serving = llm_serving
self.score_name = 'AlpagasusScore'
self.dimension = dimension
self.prompt = AlpagasusPrompt(dimension=self.dimension)
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过调用GPT评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。\n"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- dimension:评估维度,默认为'quality'\n"
"- input_instruction_key:指令字段名\n"
"- input_input_key:输入文本字段名\n"
"- input_output_key:输出文本字段名\n"
"- output_key:输出得分字段名,默认'AlpagasusScore'\n"
"输出参数:\n"
"- 包含评估得分的DataFrame"
)
elif lang == "en":
return (
"Evaluate instruction quality using GPT; higher scores indicate better quality.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- dimension: Evaluation dimension, default 'quality'\n"
"- input_instruction_key: Field name for instruction\n"
"- input_input_key: Field name for input text\n"
"- input_output_key: Field name for output text\n"
"- output_key: Field name for output score, default 'AlpagasusScore'\n"
"Output Parameters:\n"
"- DataFrame containing evaluation scores"
)
else:
return "Evaluate instruction quality using GPT; higher scores indicate better quality."
def get_score(self, samples, input_instruction_key, input_input_key, input_output_key):
system_prompts = []
user_prompts = []
for sample in samples:
            instruction = sample.get(input_instruction_key, '')
            response = sample.get(input_output_key, '')
            input_text = sample.get(input_input_key, '')
system_prompts.append(self.prompt.build_system_prompt(instruction, input_text, response))
user_prompts.append(self.prompt.build_prompt())
inputs = [system + "\n" + user for system, user in zip(system_prompts, user_prompts)]
responses = self.llm_serving.generate_from_input(user_inputs=inputs)
scores = []
for response in responses:
score_line = response.strip().split("\n")[0]
score = float(score_line.split()[0])
scores.append(score)
return scores
def eval(self, dataframe: pd.DataFrame, input_instruction_key: str, input_input_key: str, input_output_key: str):
samples = dataframe.to_dict(orient='records')
self.logger.info(f"Evaluating {self.score_name}...")
scores = self.get_score(samples, input_instruction_key, input_input_key, input_output_key)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_instruction_key: str, input_input_key: str, input_output_key: str, output_key: str='AlpagasusScore'):
self.input_instruction_key = input_instruction_key
self.input_input_key = input_input_key
self.input_output_key = input_output_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, self.input_instruction_key, self.input_input_key, self.input_output_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
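
# A minimal sketch of the evaluator with a stand-in serving backend. The stub returns
# "<score>\n<explanation>" strings, which is the shape get_score() expects (it reads the
# first token of the first line as the score). All names and values are illustrative.
if __name__ == "__main__":

    class _ToyServing:
        def generate_from_input(self, user_inputs, system_prompt=""):
            return ["5\nThe response fully answers the instruction." for _ in user_inputs]

    df = pd.DataFrame({
        "instruction": ["Explain what a DataFrame is."],
        "input": [""],
        "output": ["A DataFrame is a 2D labelled data structure in pandas."],
    })
    evaluator = AlpagasusSampleEvaluator(llm_serving=_ToyServing(), dimension="quality")
    print(evaluator.eval(df, "instruction", "input", "output"))  # -> [5.0]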
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from scipy.special import softmax
from dataflow import get_logger
import torch
from tqdm import tqdm
@OPERATOR_REGISTRY.register()
class DeitaComplexitySampleEvaluator(OperatorABC):
def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.model_name = "hkust-nlp/deita-complexity-scorer"
self.model_cache_dir = model_cache_dir
self.max_length = max_length
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = AutoModelForCausalLM.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.score_name = 'DeitaComplexityScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Llama模型的Deita指令复杂性评估器,通过生成1-6分的复杂性评分评估指令难度。\n"
"输入参数:\n"
"- device:计算设备,默认为'cuda'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- max_length:最大序列长度,默认为512\n"
"- input_instruction_key:指令文本字段名,默认为'instruction'\n"
"- input_output_key:输出文本字段名,默认为'output'\n"
"- output_key:输出得分字段名,默认为'DeitaComplexityScore'\n"
"输出参数:\n"
"- 包含指令复杂性评分的DataFrame(1-6分)"
)
elif lang == "en":
return (
"Llama-based Deita instruction complexity evaluator generating 1-6 complexity scores.\n"
"Input Parameters:\n"
"- device: Computing device, default 'cuda'\n"
"- model_cache_dir: Model cache directory, default './dataflow_cache'\n"
"- max_length: Maximum sequence length, default 512\n"
"- input_instruction_key: Field name for instruction text, default 'instruction'\n"
"- input_output_key: Field name for output text, default 'output'\n"
"- output_key: Field name for output score, default 'DeitaComplexityScore'\n"
"Output Parameters:\n"
"- DataFrame containing instruction complexity scores (1-6)"
)
else:
return "Measure instruction complexity using Llama-based Deita model."
def infer_complexity(self, input_text):
complexity_template = ("You are a helpful assistant. Please identify the complexity score of the following user query. \n##Query: {instruction}\n##Complexity: ")
user_input = complexity_template.format(instruction=input_text)
input_ids = self.tokenizer.encode(user_input, return_tensors="pt").to(self.device)
outputs = self.model.generate(input_ids, max_new_tokens=self.max_length, num_return_sequences=1, return_dict_in_generate=True, output_scores=True)
logprobs_list = outputs.scores[0][0]
# Mapping of token IDs to complexity scores
id2score = {
29896: 1, # Complexity level 1
29906: 2, # Complexity level 2
29941: 3, # Complexity level 3
29946: 4, # Complexity level 4
29945: 5, # Complexity level 5
29953: 6 # Complexity level 6
}
score_template = np.array([1, 2, 3, 4, 5, 6]) # Define the score template
score_logits = []
for k in id2score:
score_logits.append(logprobs_list[k].cpu().numpy())
score_logits = np.array(score_logits)
score_npy = softmax(score_logits, axis=0) # Apply softmax to get probabilities
score_npy = score_npy * score_template # Weight the scores by the corresponding complexity level
final_score = np.sum(score_npy, axis=0) # Sum the weighted scores to get the final score
return final_score
def eval(self, dataframe, input_instruction_key: str = 'instruction', input_output_key: str = 'output'):
self.logger.info(f"Evaluating {self.score_name}...")
scores = []
for sample in tqdm(dataframe[[input_instruction_key, input_output_key]].to_dict(orient='records'), desc="Deita complexity model evaluating..."):
quality_score = self.infer_complexity(sample[input_instruction_key])
scores.append(quality_score)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_output_key: str = 'output', output_key: str = 'DeitaComplexityScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_instruction_key, input_output_key)
dataframe[output_key] = scores
storage.write(dataframe)
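
# infer_complexity() above turns the first generated token's logits into an expected
# score: softmax over the logits of the six digit tokens ("1".."6"), then a weighted
# sum with the score template [1..6]. A self-contained numeric illustration with
# made-up logits (no model involved):
if __name__ == "__main__":
    fake_digit_logits = np.array([0.1, 0.3, 2.0, 1.2, 0.2, 0.1])  # logits for tokens "1".."6"
    probs = softmax(fake_digit_logits)
    expected_score = float(np.sum(probs * np.array([1, 2, 3, 4, 5, 6])))
    print(f"expected complexity score: {expected_score:.2f}")  # a value between 1 and 6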
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from scipy.special import softmax
import requests
import torch
from dataflow import get_logger
from tqdm import tqdm
@OPERATOR_REGISTRY.register()
class DeitaQualitySampleEvaluator(OperatorABC):
def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.model_name = 'hkust-nlp/deita-quality-scorer'
self.model_cache_dir = model_cache_dir
self.max_length = max_length
self.token_strs = ["1", "2", "3", "4", "5", "6"]
self.score_template = np.array([1, 2, 3, 4, 5, 6])
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = AutoModelForCausalLM.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.score_name = 'DeitaQualityScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Llama模型的Deita指令质量评估器,通过生成1-6分的质量评分评估指令质量。\n"
"输入参数:\n"
"- device:计算设备,默认为'cuda'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- max_length:最大序列长度,默认为512\n"
"- input_instruction_key:指令文本字段名,默认为'instruction'\n"
"- input_output_key:输出文本字段名,默认为'output'\n"
"- output_key:输出得分字段名,默认为'DeitaQualityScore'\n"
"输出参数:\n"
"- 包含指令质量评分的DataFrame(1-6分)"
)
elif lang == "en":
return (
"Llama-based Deita instruction quality evaluator generating 1-6 quality scores.\n"
"Input Parameters:\n"
"- device: Computing device, default 'cuda'\n"
"- model_cache_dir: Model cache directory, default './dataflow_cache'\n"
"- max_length: Maximum sequence length, default 512\n"
"- input_instruction_key: Field name for instruction text, default 'instruction'\n"
"- input_output_key: Field name for output text, default 'output'\n"
"- output_key: Field name for output score, default 'DeitaQualityScore'\n"
"Output Parameters:\n"
"- DataFrame containing instruction quality scores (1-6)"
)
else:
return "Evaluate instruction quality using Llama-based Deita model."
def infer_quality(self, input_text, resp_text):
# Define the template and input format
quality_template = ("You are a helpful assistant. Please identify the quality score of the Response corresponding to the Question.\n"
"#Question#:\n{instruction}\n#Response#:\n{output}\n##Quality: ")
user_input = quality_template.format(instruction=input_text, output=resp_text)
input_ids = self.tokenizer.encode(user_input, return_tensors="pt").to(self.device)
outputs = self.model.generate(input_ids, max_new_tokens=self.max_length, num_return_sequences=1, return_dict_in_generate=True, output_scores=True)
logprobs_list = outputs.scores[0][0]
id2score = {
29896: "1",
29906: "2",
29941: "3",
29946: "4",
29945: "5",
29953: "6"
}
score_logits = []
for k in id2score:
score_logits.append(logprobs_list[k].cpu().numpy())
score_logits = np.array(score_logits)
score_npy = softmax(score_logits, axis=0)
score_npy = score_npy * self.score_template
final_score = np.sum(score_npy, axis=0)
return final_score
def eval(self, dataframe, input_instruction_key: str = 'instruction', input_output_key: str = 'output'):
scores = []
self.logger.info(f"Evaluating {self.score_name}...")
        for sample in tqdm(dataframe[[input_instruction_key, input_output_key]].to_dict(orient='records'), desc="Deita quality model evaluating..."):
            quality_score = self.infer_quality(sample[input_instruction_key], sample[input_output_key])
scores.append(quality_score)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_output_key: str = 'output', output_key: str = 'DeitaQualityScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_instruction_key, input_output_key)
dataframe[output_key] = scores
storage.write(dataframe)
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from dataflow import get_logger
import json
@OPERATOR_REGISTRY.register()
class InstagSampleEvaluator(OperatorABC):
def __init__(self, model_cache_dir='./dataflow_cache', device='cuda', max_new_tokens=1024, temperature=0, do_sample=False, num_return_sequences=1, return_dict_in_generate=True):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.model_name = 'OFA-Sys/InsTagger'
self.model_cache_dir = model_cache_dir
self.max_new_tokens = max_new_tokens
self.temperature = temperature
self.do_sample = do_sample
self.num_return_sequences = num_return_sequences
self.return_dict_in_generate = return_dict_in_generate
self.token_strs = ["1", "2", "3", "4", "5", "6"]
self.score_template = np.array([1, 2, 3, 4, 5, 6])
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = AutoModelForCausalLM.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.model.requires_grad_(False)
self.model.eval()
self.score_name = 'InstagScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Instag评分器评估指令的内容多样性和意图标签。通过分析指令文本生成相关标签,标签数量越多表示内容多样性越大,"
"同时返回标签的详细解释。基于OFA-Sys/InsTagger模型实现。\n"
"输入参数:\n"
"- query: 待评估的指令文本\n"
"输出参数:\n"
"- int: 标签数量(内容多样性指标)\n"
"- list: 包含标签和解释的字典列表"
)
else:
return (
"Evaluate instruction content diversity and intention tags using the Instag scorer. Generate relevant tags by analyzing instruction text, "
"with more tags indicating greater content diversity, while returning detailed explanations of tags. Implemented based on OFA-Sys/InsTagger model.\n"
"Input parameters:\n"
"- query: Instruction text to be evaluated\n"
"Output parameters:\n"
"- int: Number of tags (content diversity indicator)\n"
"- list: List of dictionaries containing tags and explanations"
)
def make_prompt(self, query):
prompt = f"Please identify tags of user intentions in the following user query and provide an explanation for each tag. Please respond in the JSON format {{\"tag\": str, \"explanation\": str}}.\nUser query: {query}"
messages = [("user", prompt), ("Assistant", None)]
seps = [" ", "</s>"]
ret = "system: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + seps[0]
for i, (role, message) in enumerate(messages):
if message:
ret += role + ": " + message + seps[i % 2]
else:
ret += role + ":"
return ret
def inference_batch(self, queries):
"""Process batch of queries using either local model or API."""
input_strs = [self.make_prompt(query) for query in queries]
input_tokens = self.tokenizer(input_strs, return_tensors="pt", padding=True)
if torch.cuda.is_available():
input_tokens = {key: value.to(self.device) for key, value in input_tokens.items()}
output = self.model.generate(
input_tokens['input_ids'],
temperature=self.temperature,
do_sample=self.do_sample,
max_new_tokens=self.max_new_tokens,
num_return_sequences=self.num_return_sequences,
return_dict_in_generate=self.return_dict_in_generate,
)
num_input_tokens = input_tokens["input_ids"].shape[1]
output_tokens = output.sequences
generated_tokens = output_tokens[:, num_input_tokens:]
generated_texts = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
json_outputs = []
for generated_text in generated_texts:
string_output = generated_text.strip()
try:
json_output = json.loads(string_output)
except json.JSONDecodeError:
self.logger.warning(f"JSON parse error: {string_output}")
json_output = {"tag": "Parsing error", "explanation": string_output[:100]}
json_outputs.append(json_output)
return json_outputs
def _score_func(self, sample, input_instruction_key):
json_output = self.inference_batch([sample])[0]
complexity_score = None
        if isinstance(json_output, list):
            complexity_score = len(json_output)
            self.logger.info(f"List-type JSON, number of tags: {complexity_score}")
        elif isinstance(json_output, dict) and "tag" in json_output:
            complexity_score = 1
            self.logger.info("Dict-type JSON with a 'tag' field, score = 1")
        elif isinstance(json_output, dict) and len(json_output) > 0:
            complexity_score = 1
            self.logger.info(f"Other dict-type JSON, score = 1: {json_output}")
        else:
            complexity_score = 0
            self.logger.warning(f"Unrecognized JSON type or empty data, score = 0: {json_output}")
return complexity_score
def eval(self, dataframe: pd.DataFrame, input_instruction_key: str):
self.logger.info(f"Evaluating {self.score_name}...")
scores = []
        for sample in tqdm(dataframe[input_instruction_key], desc="InsTagger model evaluating..."):
scores.append(self._score_func(sample, input_instruction_key))
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', output_key: str = 'InstagScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_instruction_key)
dataframe[output_key] = scores
storage.write(dataframe)
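
# _score_func() above scores a query by the number of intention tags InsTagger emits.
# A self-contained illustration of the scoring rule on hand-written JSON strings that
# mimic the expected model response format (no model involved):
if __name__ == "__main__":
    example_list = json.loads('[{"tag": "information request", "explanation": "asks for facts"}, {"tag": "comparison", "explanation": "asks to compare options"}]')
    example_dict = json.loads('{"tag": "information request", "explanation": "asks for facts"}')
    print(len(example_list))                  # list of tag objects -> score 2 (more tags = more diversity)
    print(1 if "tag" in example_dict else 0)  # single tag object -> score 1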
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
import torch
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.utils import get_logger
# RMScorer for evaluating based on reward-model-deberta-v3-large-v2
@OPERATOR_REGISTRY.register()
class RMSampleEvaluator(OperatorABC):
    def __init__(self, device='cuda', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model_name = 'OpenAssistant/reward-model-deberta-v3-large-v2'
self.model_cache_dir = model_cache_dir
self.score_name = 'RewardModelScore'
self.device = device
self.rank_model = AutoModelForSequenceClassification.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于人类偏好数据训练的奖励模型(OpenAssistant/reward-model-deberta-v3-large-v2)对文本质量进行打分,高分代表质量较高。"
"模型输入为指令和响应文本对,输出0-1之间的奖励分数,反映人类对文本质量的偏好判断。\n"
"输入参数:\n"
"- instruction: 指令文本字符串\n"
"- output: 响应文本字符串\n"
"输出参数:\n"
"- float: 0-1之间的奖励分数,越高表示质量越好"
)
else:
return (
"Score text quality using a reward model trained on human preference data (OpenAssistant/reward-model-deberta-v3-large-v2), where higher scores indicate better quality. "
"The model takes instruction-response text pairs as input and outputs a reward score between 0 and 1, reflecting human preference judgments on text quality.\n"
"Input parameters:\n"
"- instruction: Instruction text string\n"
"- output: Response text string\n"
"Output parameters:\n"
"- float: Reward score between 0 and 1, higher values indicate better quality"
)
def eval(self, dataframe, input_instruction_key: str = 'instruction', input_output_key: str = 'output'):
input_texts = dataframe.get(input_instruction_key, '').to_list()
output_texts = dataframe.get(input_output_key, '').to_list()
inputs = self.tokenizer(input_texts, output_texts, return_tensors='pt', padding=True, truncation=True).to(self.device)
self.logger.info(f"Evaluating {self.score_name}...")
with torch.no_grad():
logits = self.rank_model(**inputs).logits.cpu().detach().numpy()
        scores = logits.squeeze()
        self.logger.info("Evaluation complete!")
        if scores.ndim == 0:
            # a single sample squeezes to a 0-d array; wrap it so a plain list is always returned
            return [float(scores)]
        return scores.tolist()
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_output_key: str = 'output', output_key: str = 'RMScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_instruction_key, input_output_key)
dataframe[output_key] = scores
storage.write(dataframe)
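
# A minimal usage sketch (downloads the reward model on first run; values are
# illustrative). eval() returns one raw classification-head score per
# (instruction, output) pair; higher means the pair is preferred by the reward model.
if __name__ == "__main__":
    import pandas as pd  # local import: this module does not otherwise need pandas
    df = pd.DataFrame({
        "instruction": ["Explain photosynthesis briefly."],
        "output": ["Plants convert light, water and CO2 into glucose and oxygen."],
    })
    evaluator = RMSampleEvaluator(device="cuda" if torch.cuda.is_available() else "cpu")
    print(evaluator.eval(df, "instruction", "output"))  # one raw reward score per row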
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_sft.eval.Superfiltering.data_analysis import get_perplexity_and_embedding_whole_text, get_perplexity_and_embedding_part_text
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import torch
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.utils import get_logger
import pandas as pd
# Superfiltering instruction quality (ifd) evaluation
# cited from: Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning
@OPERATOR_REGISTRY.register()
class SuperfilteringSampleEvaluator(OperatorABC):
def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.device = device
self.model_name = 'gpt2'
self.model_cache_dir = model_cache_dir
self.max_length = max_length
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device,
cache_dir=self.model_cache_dir,
output_hidden_states=True
).to(self.device)
self.score_name = 'SuperfilteringScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Superfiltering方法评估指令的跟随难度,基于GPT-2模型计算条件困惑度与独立困惑度的比值,得分越高表示指令越难跟随。"
"该方法通过比较指令条件下的响应困惑度与独立响应困惑度,评估指令的清晰度和跟随难度。\n"
"输入参数:\n"
"- instruction: 指令文本\n"
"- input_text: 输入文本(可选)\n"
"- output: 响应文本\n"
"输出参数:\n"
"- float: 困惑度比值,越高表示指令跟随难度越大"
)
else:
return (
"Evaluate the follow difficulty of instructions using the Superfiltering method, which calculates the ratio of conditional perplexity to independent perplexity based on the GPT-2 model. "
"Higher scores indicate greater difficulty in following the instruction. This method assesses instruction clarity and follow difficulty by comparing response perplexity under instruction conditions with independent response perplexity.\n"
"Input parameters:\n"
"- instruction: Instruction text\n"
"- input_text: Input text (optional)\n"
"- output: Response text\n"
"Output parameters:\n"
"- float: Perplexity ratio, higher values indicate greater instruction following difficulty"
)
def inference(self, instruction, input_text, output):
PROMPT_DICT_NONE = {
"prompt_input": (
"{instruction}\n{input}\n"
),
"prompt_no_input": (
"{instruction}\n"
),
}
prompt_no_input = PROMPT_DICT_NONE["prompt_no_input"]
prompt_input = PROMPT_DICT_NONE["prompt_input"]
if input_text == '':
temp_dict = {'instruction': instruction}
prompt_to_use = prompt_no_input.format_map(temp_dict)
whole_text = prompt_to_use + output
instruction = prompt_to_use
else:
temp_dict = {'instruction': instruction, 'input': input_text}
prompt_to_use = prompt_input.format_map(temp_dict)
whole_text = prompt_to_use + output
instruction = prompt_to_use
if output == '':
return None
instruction_input_ids = self.tokenizer.encode(instruction, return_tensors="pt", truncation=True, max_length=self.max_length).to(self.device)
instruction_len = instruction_input_ids.shape[1]
ppl_out_alone, _ = get_perplexity_and_embedding_whole_text(self.tokenizer, self.model, output, self.max_length - instruction_len + 1, self.device)
ppl_out_condition, _ = get_perplexity_and_embedding_part_text(self.tokenizer, self.model, whole_text, output, self.max_length, self.device)
if ppl_out_alone != 0:
score = ppl_out_condition / ppl_out_alone
else:
score = 0
        if score != score:  # NaN check
score = None
return score
def _score_func(self, sample, input_instruction_key: str = 'instruction', input_input_key: str = 'input', input_output_key: str = 'output'):
        instruction = sample.get(input_instruction_key, '')
        output = sample.get(input_output_key, '')
        input_text = sample.get(input_input_key, '') if input_input_key is not None and input_input_key in sample else ''
if not output:
score = None
else:
score = self.inference(instruction, input_text, output)
return score
def eval(self, dataframe: pd.DataFrame, input_instruction_key: str = 'instruction', input_input_key: str = None, input_output_key: str = 'output'):
self.logger.info(f"Evaluating {self.score_name}...")
key_list = [input_instruction_key, input_output_key]
if input_input_key is not None:
key_list.append(input_input_key)
scores = [self._score_func(sample, input_instruction_key, input_input_key, input_output_key) for sample in tqdm(dataframe[key_list].to_dict(orient='records'), desc="SuperfilteringScorer evaluating...")]
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_input_key: str = None, input_output_key: str = 'output', output_key: str = 'SuperfilteringScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_instruction_key, input_input_key, input_output_key)
dataframe[output_key] = scores
storage.write(dataframe)
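
# A minimal usage sketch (downloads GPT-2 on first run; field names follow the defaults
# above). Scores near or above 1.0 mean the instruction barely helps the model predict
# the response, i.e. the sample is "hard to follow".
if __name__ == "__main__":
    df = pd.DataFrame({
        "instruction": ["Translate to French: Good morning."],
        "output": ["Bonjour."],
    })
    evaluator = SuperfilteringSampleEvaluator(device="cuda" if torch.cuda.is_available() else "cpu")
    print(evaluator.eval(df, input_instruction_key="instruction", input_output_key="output"))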
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.prompts.general_text import TreeinstructPrompt
from dataflow.core.prompt import prompt_restrict
@prompt_restrict(
TreeinstructPrompt
)
@OPERATOR_REGISTRY.register()
class TreeinstructSampleEvaluator(OperatorABC):
def __init__(self, llm_serving: LLMServingABC = None):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.llm_serving = llm_serving
self.score_name = 'TreeinstructScore'
self.prompt = TreeinstructPrompt()
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- input_instruction_key:指令字段名\n"
"- output_key:输出得分字段名,默认'TreeinstructScore'\n"
"输出参数:\n"
"- 包含指令复杂性得分的DataFrame"
)
elif lang == "en":
return (
"Measure instruction complexity by syntax tree size; more nodes mean more complexity.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- input_instruction_key: Field name for instruction\n"
"- output_key: Field name for output score, default 'TreeinstructScore'\n"
"Output Parameters:\n"
"- DataFrame containing instruction complexity scores"
)
else:
return "Measure instruction complexity by syntax tree size; more nodes mean more complexity."
def get_score(self, samples, input_instruction_key):
system_prompts = []
user_prompts = []
for sample in samples:
            instruction = sample.get(input_instruction_key, '')
system_prompts.append(self.prompt.build_system_prompt(instruction))
user_prompts.append(self.prompt.build_prompt())
inputs = [system + "\n" + user for system, user in zip(system_prompts, user_prompts)]
responses = self.llm_serving.generate_from_input(user_inputs=inputs)
scores = []
for response in responses:
response_lines = response.strip().split("\n")
score_line = response_lines[-1]
score = float(score_line.split()[0])
scores.append(score)
return scores
def eval(self, dataframe: pd.DataFrame, input_instruction_key: str):
self.logger.info(f"Evaluating {self.score_name}...")
samples = dataframe.to_dict(orient='records')
scores = self.get_score(samples, input_instruction_key)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_instruction_key: str, output_key: str='TreeinstructScore'):
self.input_instruction_key = input_instruction_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, self.input_instruction_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
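
# A minimal sketch with a stand-in serving backend. get_score() reads the *last* line
# of each response and takes its first whitespace-separated token as the node count,
# so responses are expected to end with a line like "12" or "12 nodes". Names and
# values below are illustrative.
if __name__ == "__main__":

    class _ToyServing:
        def generate_from_input(self, user_inputs, system_prompt=""):
            return ["The syntax tree has several clauses.\n12" for _ in user_inputs]

    df = pd.DataFrame({"instruction": ["Write a haiku about autumn and explain its structure."]})
    evaluator = TreeinstructSampleEvaluator(llm_serving=_ToyServing())
    print(evaluator.eval(df, "instruction"))  # -> [12.0]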
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.core import LLMServingABC
from dataflow.operators.text_sft import AlpagasusSampleEvaluator
@OPERATOR_REGISTRY.register()
class AlpagasusFilter(OperatorABC):
def __init__(self, min_score=3, max_score=5, llm_serving: LLMServingABC = None, dimension='quality'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}...")
self.scorer = AlpagasusSampleEvaluator(llm_serving, dimension)
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于AlpagasusScorer打分器的得分对数据进行过滤。通过调用GPT模型评估指令的质量,返回一个质量得分。\n\n"
"初始化参数:\n"
"- min_score: 最低分数阈值,默认为3\n"
"- max_score: 最高分数阈值,默认为5\n"
"- llm_serving: LLM服务实例\n"
"- dimension: 评估维度,默认为'quality'(质量)\n\n"
"运行参数:\n"
"- input_instruction_key: 输入指令字段名\n"
"- input_input_key: 输入内容字段名\n"
"- input_output_key: 输出内容字段名\n"
"- output_key: 输出分数字段名,默认为'AlpagasusScore'\n\n"
"过滤逻辑:保留分数在[min_score, max_score]范围内的数据"
)
else:
return (
"Filter data using scores from the AlpagasusScorer. Evaluate instruction quality using GPT model and return a quality score.\n\n"
"Initialization Parameters:\n"
"- min_score: Minimum score threshold, default is 3\n"
"- max_score: Maximum score threshold, default is 5\n"
"- llm_serving: LLM serving instance\n"
"- dimension: Evaluation dimension, default is 'quality'\n\n"
"Run Parameters:\n"
"- input_instruction_key: Input instruction field name\n"
"- input_input_key: Input content field name\n"
"- input_output_key: Output content field name\n"
"- output_key: Output score field name, default is 'AlpagasusScore'\n\n"
"Filter Logic: Keep data with scores in [min_score, max_score] range"
)
def run(self, storage: DataFlowStorage, input_instruction_key: str, input_input_key: str, input_output_key: str, output_key: str='AlpagasusScore'):
self.input_instruction_key = input_instruction_key
self.input_input_key = input_input_key
self.input_output_key = input_output_key
self.output_key = output_key
self.logger.info(f"Running {self.__class__.__name__} with input_instruction_key = {self.input_instruction_key}, input_input_key = {self.input_input_key}, input_output_key = {self.input_output_key} and output_key = {self.output_key}...")
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_instruction_key, self.input_input_key, self.input_output_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
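
# A minimal end-to-end sketch of the filter (stand-in serving/storage; names are
# illustrative). Rows whose AlpagasusScore falls outside [min_score, max_score] are
# dropped before the DataFrame is written back, and the score column name is returned.
if __name__ == "__main__":
    import pandas as pd

    class _ToyServing:
        def generate_from_input(self, user_inputs, system_prompt=""):
            # first sample scored 4, second scored 2 -> only the first survives min_score=3
            return ["4\nGood answer.", "2\nWeak answer."][:len(user_inputs)]

    class _ToyStorage:
        def __init__(self, df): self._df = df
        def read(self, _fmt): return self._df
        def write(self, df): self._df = df

    df = pd.DataFrame({
        "instruction": ["Define entropy.", "Define entropy."],
        "input": ["", ""],
        "output": ["A measure of uncertainty.", "Dunno."],
    })
    flt = AlpagasusFilter(min_score=3, max_score=5, llm_serving=_ToyServing())
    print(flt.run(_ToyStorage(df), "instruction", "input", "output"))  # -> ['AlpagasusScore']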
from dataflow.operators.text_sft import DeitaComplexitySampleEvaluator
from dataflow.core import OperatorABC
import numpy as np
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class DeitaComplexityFilter(OperatorABC):
def __init__(self, min_score=3.0, max_score=5.0, device='cuda', model_cache_dir='./dataflow_cache', max_length=512):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = DeitaComplexitySampleEvaluator(
device=device,
model_cache_dir=model_cache_dir,
max_length=max_length,
)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于DeitaComplexityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令复杂性评估器,评估指令的复杂程度。\n\n"
"初始化参数:\n"
"- min_score: 最低分数阈值,默认为3.0\n"
"- max_score: 最高分数阈值,默认为5.0\n"
"- device: 运行设备,默认为'cuda'\n"
"- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n"
"- max_length: 最大序列长度,默认为512\n\n"
"运行参数:\n"
"- input_instruction_key: 输入指令字段名,默认为'instruction'\n"
"- input_output_key: 输入输出字段名,默认为'output'\n"
"- output_key: 输出分数字段名,默认为'DeitaComplexityScore'\n\n"
"评分标准:1-6分,分数越高表示指令复杂性越高\n"
"过滤逻辑:保留分数在[min_score, max_score]范围内的数据"
)
else:
return (
"Filter data using scores from the DeitaComplexityScorer. Evaluate instruction complexity using Llama-based Deita model.\n\n"
"Initialization Parameters:\n"
"- min_score: Minimum score threshold, default is 3.0\n"
"- max_score: Maximum score threshold, default is 5.0\n"
"- device: Running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- max_length: Maximum sequence length, default is 512\n\n"
"Run Parameters:\n"
"- input_instruction_key: Input instruction field name, default is 'instruction'\n"
"- input_output_key: Input output field name, default is 'output'\n"
"- output_key: Output score field name, default is 'DeitaComplexityScore'\n\n"
"Scoring Standard: 1-6 points, higher score indicates higher instruction complexity\n"
"Filter Logic: Keep data with scores in [min_score, max_score] range"
)
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_output_key : str = 'output', output_key: str = "DeitaComplexityScore"):
self.input_instruction_key = input_instruction_key
self.input_output_key = input_output_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
# Get the quality scores
scores = self.scorer.eval(dataframe, input_instruction_key, input_output_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.operators.text_sft import DeitaQualitySampleEvaluator
from dataflow.core import OperatorABC
import numpy as np
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class DeitaQualityFilter(OperatorABC):
def __init__(self, min_score=2.5, max_score=10000.0, device='cuda', model_cache_dir='./dataflow_cache', max_length=512):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = DeitaQualitySampleEvaluator(
device=device,
model_cache_dir=model_cache_dir,
max_length=max_length,
)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于DeitaQualityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令质量评估器,评估指令的质量高低。\n\n"
"初始化参数:\n"
"- min_score: 最低分数阈值,默认为2.5\n"
"- max_score: 最高分数阈值,默认为10000.0\n"
"- device: 运行设备,默认为'cuda'\n"
"- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n"
"- max_length: 最大序列长度,默认为512\n\n"
"运行参数:\n"
"- input_instruction_key: 输入指令字段名,默认为'instruction'\n"
"- input_output_key: 输入输出字段名,默认为'output'\n"
"- output_key: 输出分数字段名,默认为'DeitaQualityScore'\n\n"
"评分标准:1-6分,分数越高表示指令质量越高\n"
"过滤逻辑:保留分数在[min_score, max_score]范围内的数据"
)
else:
return (
"Filter data using scores from the DeitaQualityScorer. Evaluate instruction quality using Llama-based Deita model.\n\n"
"Initialization Parameters:\n"
"- min_score: Minimum score threshold, default is 2.5\n"
"- max_score: Maximum score threshold, default is 10000.0\n"
"- device: Running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- max_length: Maximum sequence length, default is 512\n\n"
"Run Parameters:\n"
"- input_instruction_key: Input instruction field name, default is 'instruction'\n"
"- input_output_key: Input output field name, default is 'output'\n"
"- output_key: Output score field name, default is 'DeitaQualityScore'\n\n"
"Scoring Standard: 1-6 points, higher score indicates higher instruction quality\n"
"Filter Logic: Keep data with scores in [min_score, max_score] range"
)
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_output_key : str = 'output', output_key: str = "DeitaQualityScore"):
self.input_instruction_key = input_instruction_key
self.input_output_key = input_output_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
scores = self.scorer.eval(dataframe, input_instruction_key, input_output_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.operators.text_sft import InstagSampleEvaluator
from dataflow.core import OperatorABC
import numpy as np
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class InstagFilter(OperatorABC):
def __init__(self, min_score=0.0, max_score=1.0, model_cache_dir='./dataflow_cache', device='cuda', max_new_tokens=1024):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
# Initialize the scorer
self.scorer = InstagSampleEvaluator(
model_cache_dir=model_cache_dir,
device=device,
max_new_tokens=max_new_tokens
)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return "基于InstagScorer打分器的过滤算子。使用预训练的Instag模型对指令进行分析,返回标签的数量来评估指令的内容多样性。参数包括模型缓存目录(model_cache_dir)、计算设备(device)和最大新生成标记数(max_new_tokens)。过滤范围由min_score和max_score参数控制,标签越多表示内容多样性越大。"
else:
return "Filter operator based on InstagScorer. Uses pre-trained Instag model to analyze instructions, returning the number of tags to evaluate content diversity. Parameters include model cache directory (model_cache_dir), computing device (device), and maximum new tokens (max_new_tokens). Filter range is controlled by min_score and max_score parameters, with more tags indicating greater content diversity."
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', output_key: str = 'InstagScore'):
self.input_instruction_key = input_instruction_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
scores = self.scorer.eval(dataframe, self.input_instruction_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_sft import RMSampleEvaluator
@OPERATOR_REGISTRY.register()
class RMFilter(OperatorABC):
def __init__(self, min_score: float = 0.2, max_score: float = 0.8, device='cuda', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = RMSampleEvaluator(device=device, model_cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score}, max_score = {self.max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于RMScorer打分器的得分对数据进行过滤。使用基于人类偏好数据训练的奖励模型对文本质量进行评分,高分代表质量较高。\n"
"奖励模型能够评估文本的相关性、有用性、无害性等人类偏好指标,可用于筛选符合人类价值观的高质量文本。\n"
"输入参数:\n"
"- min_score:保留样本的最小奖励分数阈值,默认为0.2\n"
"- max_score:保留样本的最大奖励分数阈值,默认为0.8\n"
"- device:模型运行设备,默认为'cuda'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- input_instruction_key:指令字段名,默认为'instruction'\n"
"- input_output_key:输出字段名,默认为'output'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留奖励分数在[min_score, max_score]范围内的样本\n"
"- 返回包含奖励分数字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the RMScorer. Quality scoring using reward model trained with human preference data, where higher scores indicate better quality.\n"
"Reward model evaluates human preference metrics such as relevance, helpfulness, and harmlessness, useful for filtering high-quality text aligned with human values.\n"
"Input Parameters:\n"
"- min_score: Minimum reward score threshold for retaining samples, default is 0.2\n"
"- max_score: Maximum reward score threshold for retaining samples, default is 0.8\n"
"- device: Model running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- input_instruction_key: Instruction field name, default is 'instruction'\n"
"- input_output_key: Output field name, default is 'output'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with reward scores within [min_score, max_score] range\n"
"- List containing reward score field name for subsequent operator reference"
)
else:
return "Filter data based on quality scores from human preference-trained reward model."
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_output_key: str = 'output', output_key: str = 'RMScore'):
self.input_instruction_key = input_instruction_key
self.input_output_key = input_output_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_instruction_key = {self.input_instruction_key}, intput_output_key = {self.input_output_key}, output_key = {self.output_key}...")
scores = np.array(self.scorer.eval(dataframe, self.input_instruction_key, self.input_output_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.operators.text_sft import SuperfilteringSampleEvaluator
import numpy as np
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class SuperfilteringFilter(OperatorABC):
def __init__(self, min_score=0.0, max_score=1.0, device='cuda', model_cache_dir='./dataflow_cache', max_length=512):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = SuperfilteringSampleEvaluator(
device=device,
model_cache_dir=model_cache_dir,
max_length=max_length
)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Superfiltering评分器过滤掉低质量数据。基于GPT-2模型计算困惑度比值来评估指令跟随难度,比值越低表示指令越容易被模型理解和执行。\n"
"适用于筛选适合特定模型能力的指令数据,提高模型训练效率和效果。\n"
"输入参数:\n"
"- min_score:保留样本的最小分数阈值,默认为0.0\n"
"- max_score:保留样本的最大分数阈值,默认为1.0\n"
"- device:模型运行设备,默认为'cuda'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- max_length:文本最大长度,默认为512\n"
"- input_instruction_key:指令字段名,默认为'instruction'\n"
"- input_input_key:输入字段名,默认为'input'\n"
"- input_output_key:输出字段名,默认为'output'\n"
"- output_key:过滤结果分数字段名,默认为'SuperfilteringScore'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留分数在[min_score, max_score]范围内的样本\n"
"- 返回包含过滤结果分数字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter out low-quality data using the Superfiltering scorer. Evaluate instruction following difficulty by calculating perplexity ratio with GPT-2 model; lower ratios indicate instructions are easier for models to understand and execute.\n"
"Suitable for selecting instruction data appropriate for specific model capabilities, improving model training efficiency and effectiveness.\n"
"Input Parameters:\n"
"- min_score: Minimum score threshold for retaining samples, default is 0.0\n"
"- max_score: Maximum score threshold for retaining samples, default is 1.0\n"
"- device: Model running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- max_length: Maximum text length, default is 512\n"
"- input_instruction_key: Instruction field name, default is 'instruction'\n"
"- input_input_key: Input field name, default is 'input'\n"
"- input_output_key: Output field name, default is 'output'\n"
"- output_key: Filter result score field name, default is 'SuperfilteringScore'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with scores within [min_score, max_score] range\n"
"- List containing filter result score field name for subsequent operator reference"
)
else:
return "Filter low-quality data using perplexity ratio calculated with GPT-2 model."
def run(self, storage: DataFlowStorage, input_instruction_key: str = 'instruction', input_input_key: str = 'input', input_output_key: str = 'output', output_key: str = "SuperfilteringScore"):
self.input_instruction_key = input_instruction_key
self.input_input_key = input_input_key
self.input_output_key = input_output_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__ } with input_instruction_key = {self.input_instruction_key}, intput_output_key = {self.input_output_key}, output_key = {self.output_key}...")
# Get the scores for filtering
scores = self.scorer.eval(dataframe, input_instruction_key, input_input_key, input_output_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC, LLMServingABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_sft import TreeinstructSampleEvaluator
@OPERATOR_REGISTRY.register()
class TreeinstructFilter(OperatorABC):
def __init__(self, min_score: int = 7, max_score: int = 100, llm_serving: LLMServingABC = None):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = TreeinstructSampleEvaluator(llm_serving=llm_serving)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于TreeinstructScore打分器的得分对数据进行过滤。通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n"
"适用于筛选特定复杂度范围内的指令数据,平衡数据集难度分布,优化模型训练效果。\n"
"输入参数:\n"
"- min_score:保留样本的最小语法树节点数阈值,默认为7\n"
"- max_score:保留样本的最大语法树节点数阈值,默认为100\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- input_key:输入指令字段名\n"
"- output_key:语法树节点数字段名,默认为'TreeinstructScore'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留语法树节点数在[min_score, max_score]范围内的样本\n"
"- 返回包含语法树节点数字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the TreeinstructScore. Measure instruction complexity by the number of nodes in the generated syntax tree; more nodes indicate more complex instructions.\n"
"Suitable for selecting instruction data within specific complexity ranges, balancing dataset difficulty distribution and optimizing model training effectiveness.\n"
"Input Parameters:\n"
"- min_score: Minimum syntax tree node count threshold for retaining samples, default is 7\n"
"- max_score: Maximum syntax tree node count threshold for retaining samples, default is 100\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- input_key: Input instruction field name\n"
"- output_key: Syntax tree node count field name, default is 'TreeinstructScore'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with syntax tree node count within [min_score, max_score] range\n"
"- List containing syntax tree node count field name for subsequent operator reference"
)
else:
return "Filter data based on instruction complexity measured by syntax tree node count."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'TreeinstructScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
# Get the scores for filtering
scores = np.array(self.scorer.eval(dataframe, self.input_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import json
import random
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.prompts.general_text import CondorQuestionPrompt
from dataflow.core.prompt import prompt_restrict
@prompt_restrict(
CondorQuestionPrompt
)
@OPERATOR_REGISTRY.register()
class CondorGenerator(OperatorABC):
def __init__(self, llm_serving: LLMServingABC = None, num_samples=15, use_task_diversity=True):
        # With the built-in topics it is recommended to keep num_samples below 5000; beyond that, add your own topics in dataflow.prompts.general_text.CondorPrompt to increase data richness.
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.llm_serving = llm_serving
        self.num_questions = num_samples // 3  # each prompt yields questions at three difficulty levels
self.prompt = CondorQuestionPrompt()
        self.use_task_diversity = use_task_diversity  # whether to add task-scenario context for extra diversity
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于预置知识树标签,两阶段从0合成SFT格式数据(合成数量大于5000时建议增加标签数量)。第一阶段生成不同难度级别的问题,第二阶段为每个问题生成对应的答案。"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- num_samples:生成样本总数,建议小于5000,默认值为15\n"
"输出参数:\n"
"- 包含'difficulty'、'instruction'和'output'字段的DataFrame\n"
"- 返回生成的DataFrame用于后续处理"
)
elif lang == "en":
return (
"Two-stage generation of SFT-style data from scratch based on predefined knowledge tree tags (for over 5000 samples, consider increasing the number of tags). \n"
"First stage generates questions of varying difficulty levels, second stage generates answers for each question.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- num_samples: Total number of samples to generate, recommended to be less than 5000, default is 15\n\n"
"Output Parameters:\n"
"- DataFrame containing 'difficulty', 'instruction', and 'output' fields\n"
"- Returns generated DataFrame for subsequent processing"
)
else:
return (
"CondorGenerator generates SFT-style data through two-stage LLM generation based on predefined knowledge tree tags."
)
def parse_generated_responses(self, questions_responses):
questions_data = []
for response in questions_responses:
try:
if not isinstance(response, str):
raise ValueError("Invalid response type")
                # Parse the response string and extract the question for each difficulty level
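                # A well-formed response carries one line per difficulty, e.g.
                #   [Easy][Question Start]What is photosynthesis?[Question End]
                # which the loop below turns into {"Easy": "What is photosynthesis?"}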
question_data = {}
lines = response.split('\n')
for line in lines:
if line.startswith("[Easy]"):
question_data["Easy"] = line.replace("[Easy][Question Start]", "").replace("[Question End]", "").strip()
elif line.startswith("[Medium]"):
question_data["Medium"] = line.replace("[Medium][Question Start]", "").replace("[Question End]", "").strip()
elif line.startswith("[Hard]"):
question_data["Hard"] = line.replace("[Hard][Question Start]", "").replace("[Question End]", "").strip()
if question_data:
questions_data.append(question_data)
except Exception as e:
self.logger.debug(f'Error while parsing the response: {str(e)}')
continue
return questions_data
def run(self, storage: DataFlowStorage, output_instruction_key: str = "instruction", output_output_key: str = "output", output_difficulty_key: str = "difficulty"):
        # Build all question-generation prompts
        prompts = []
        prompt_metadata = []  # metadata for each prompt (kept for later tracing)
        for _ in range(self.num_questions):
            # Randomly pick a topic, domain, and theme for each prompt
            topic = random.choice(list(self.prompt.tag.keys()))
            domain = random.choice(list(self.prompt.tag[topic].keys()))
            theme = random.choice(self.prompt.tag[topic][domain])
            # If task-scenario diversity is enabled, randomly pick a task type
            task_type = None
            if self.use_task_diversity:
                task_type = random.choice(self.prompt.task_types)
            # Build the question prompt (keeps the original three-difficulty generation logic)
            prompt = self.prompt.build_prompt(theme, domain)
            # If a task scenario is used, prepend the scenario description to the prompt
            if task_type:
                prompt = f"""Task Scenario: {task_type}
{prompt}
Remember to frame the questions within the context of the "{task_type}" scenario."""
prompts.append(prompt)
prompt_metadata.append({
'topic': topic,
'domain': domain,
'theme': theme,
'task_type': task_type
})
        # Call the LLM once to generate all questions
        self.logger.info("Generating questions...")
        questions_responses = self.llm_serving.generate_from_input(user_inputs=prompts)
        # Parse the generated questions
        self.logger.info("Parsing questions...")
        questions_data = self.parse_generated_responses(questions_responses)
        # Build the answer-generation prompts
answer_prompts = []
for question in questions_data:
for difficulty_level in ["Easy", "Medium", "Hard"]:
question_text = question.get(difficulty_level)
if question_text:
                    # Build the answer prompt for this question
                    answer_prompt = f"Please answer this question truthfully. Question: {question_text}"
answer_prompts.append(answer_prompt)
        # Call the LLM once to generate all answers
        self.logger.info("Generating answers...")
        answer_responses = self.llm_serving.generate_from_input(user_inputs=answer_prompts)
        # Assemble the answers into records
        answers_data = []
        answer_idx = 0  # index into the answer responses
for question in questions_data:
for difficulty_level in ["Easy", "Medium", "Hard"]:
question_text = question.get(difficulty_level)
if question_text:
                    # Fetch the corresponding answer
                    answer_text = answer_responses[answer_idx].strip()
answers_data.append({
output_difficulty_key: difficulty_level,
output_instruction_key: question_text,
output_output_key: answer_text
})
                    answer_idx += 1  # advance to the next answer response
        # Write the assembled records to storage as a DataFrame
df = pd.DataFrame(answers_data)
storage.write(df)
        self.logger.info('SFT data generated.')
return [output_instruction_key, output_output_key, output_difficulty_key]
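# A minimal usage sketch (hypothetical names: `my_llm_serving` is any LLMServingABC
# implementation, `my_storage` any DataFlowStorage backend):
#
#   generator = CondorGenerator(llm_serving=my_llm_serving, num_samples=15)
#   generator.run(storage=my_storage)
#
# No input data is required: questions are synthesized from the built-in knowledge tree tags,
# and a DataFrame with 'difficulty', 'instruction' and 'output' columns is written to storage.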
from dataflow.prompts.general_text import SFTGeneratorSeedPrompt
import re
import json
import pandas as pd
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.core.prompt import prompt_restrict
def extract_json_object(model_output):
    """Extract the first JSON object that contains both 'instruction' and 'output' fields."""
json_pattern = r'\{[^}]*\}'
matches = re.findall(json_pattern, model_output)
for match in matches:
try:
obj = json.loads(match)
if 'instruction' in obj and 'output' in obj:
return obj
except json.JSONDecodeError:
continue
return None
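# For example (hypothetical model output), the first parseable object carrying both keys is returned:
#   extract_json_object('noise {"instruction": "Define AI.", "output": "AI is ..."} noise')
#   -> {'instruction': 'Define AI.', 'output': 'AI is ...'}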
@prompt_restrict(
    SFTGeneratorSeedPrompt
)
@OPERATOR_REGISTRY.register()
class SFTGeneratorSeed(OperatorABC):
def __init__(self, llm_serving: LLMServingABC, custom_prompt: str):
self.logger = get_logger()
self.prompts = SFTGeneratorSeedPrompt(custom_prompt=custom_prompt)
self.llm_serving = llm_serving
self.max_tokens = 4096
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于给定文档内容,生成监督微调格式的问答数据。并支持用户自定义生成内容要求。从原始文档中提取信息,生成符合SFT格式的指令-响应对。"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- custom_prompt:用户自定义提示词\n"
"- input_key:输入文档内容字段名,默认为'raw_content'\n"
"- max_tokens:生成文本的最大token数,默认为4096\n"
"输出参数:\n"
"- 包含'instruction'、'output'和'raw_content'字段的DataFrame\n"
"- 返回包含'instruction'和'output'字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Generate supervised fine-tuning format Q&A data based on the given document content and support user-defined content generation requirements. \n"
"Extracts information from raw documents to generate instruction-response pairs in SFT format.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- custom_prompt: User-defined custom prompt\n"
"- input_key: Field name for input document content, default is 'raw_content'\n"
"- max_tokens: Maximum number of tokens for generated text, default is 4096\n\n"
"Output Parameters:\n"
"- DataFrame containing 'instruction', 'output', and 'raw_content' fields\n"
"- List containing 'instruction' and 'output' field names for subsequent operator reference"
)
else:
return (
"SFTGeneratorSeed generates SFT format Q&A data from document content with custom prompt support."
)
def run(self, storage: DataFlowStorage, input_key: str = "raw_content"):
self.input_key = input_key
self.logger.info("Running SFTGenerator...")
# Load the raw dataframe from the input file
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
# Create a list to hold all generated questions and answers
llm_inputs = []
# Prepare LLM inputs by formatting the prompt with raw content from the dataframe
for index, row in dataframe.iterrows():
raw_content = row.get(self.input_key, '')
llm_input = self.prompts.build_prompt(content=raw_content)
llm_inputs.append(llm_input)
# Generate the text using the model
try:
self.logger.info("Generating text using the model...")
outputs = self.llm_serving.generate_from_input(llm_inputs)
self.logger.info("Text generation completed.")
except Exception as e:
self.logger.error(f"Error during text generation: {e}")
return
valid_records = []
for idx, output in enumerate(outputs):
result = extract_json_object(output)
if result:
result["raw_content"] = dataframe[self.input_key].iloc[idx] # 添加原文内容
valid_records.append(result)
# Add the generated content back to the dataframe
output_df = pd.DataFrame(valid_records)
# Save the updated dataframe to the output file
output_file = storage.write(output_df)
return ['instruction', 'output']
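# A minimal usage sketch (hypothetical `my_llm_serving`, `my_storage`, and custom prompt):
#
#   op = SFTGeneratorSeed(llm_serving=my_llm_serving, custom_prompt="Focus on medical QA.")
#   op.run(storage=my_storage, input_key="raw_content")
#
# Rows whose model output contains no JSON object with both 'instruction' and 'output'
# fields are dropped from the written DataFrame.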
import json
import random
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.prompts.general_text import CondorCritiquePrompt, CondorRefinePrompt
from dataflow.core.prompt import prompt_restrict
@prompt_restrict(
CondorCritiquePrompt,
CondorRefinePrompt
)
@OPERATOR_REGISTRY.register()
class CondorRefiner(OperatorABC):
def __init__(self, llm_serving: LLMServingABC = None):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.llm_serving = llm_serving
        self.critique_prompt = CondorCritiquePrompt()  # prompt builder for the critique stage
self.refine_prompt = CondorRefinePrompt()
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"两阶段优化指令回复质量:第一阶段调用API生成对回复的评论,第二阶段利用评论调用API改写回复,提升指令对质量。通过迭代优化提高问答对的整体质量。"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- input_instruction_key:输入指令字段名,默认为'instruction'\n"
"- input_output_key:输入回复字段名,默认为'output'\n"
"输出参数:\n"
"- 包含优化后回复的DataFrame\n"
"- 返回包含优化后回复字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Two-stage optimization of instruction-response quality: First stage calls API to generate critique on responses, \n"
"second stage uses critique to call API to refine responses, improving the quality of QA pairs through iterative optimization.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- input_instruction_key: Field name for input instructions, default is 'instruction'\n"
"- input_output_key: Field name for input responses, default is 'output'\n\n"
"Output Parameters:\n"
"- DataFrame containing refined responses\n"
"- List containing refined response field name for subsequent operator reference"
)
else:
return (
"CondorRefiner improves QA pair quality through two-stage critique and refinement process."
)
def generate_critique(self, question, answer):
        # Generate critiques in batch
critique_prompts = [self.critique_prompt.build_prompt(q, a) for q, a in zip(question, answer)]
critique_responses = self.llm_serving.generate_from_input(critique_prompts)
return critique_responses
def generate_refined_answer(self, question, answer, critique):
        # Generate refined answers in batch
refine_prompts = [self.refine_prompt.build_prompt(q, a, c) for q, a, c in zip(question, answer, critique)]
refined_answers = self.llm_serving.generate_from_input(refine_prompts)
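        # Strip the answer markers, e.g. "[Improved Answer Start]Paris is the capital of France.[Improved Answer End]"
        # becomes "Paris is the capital of France."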
        refined_answers = [ans.replace('[Improved Answer Start]', '').replace('[Improved Answer End]', '').strip() for ans in refined_answers]
return refined_answers
def run(self, storage: DataFlowStorage, input_instruction_key: str='instruction', input_output_key: str='output'):
df = storage.read('dataframe')
        # Read the batch of questions and answers from storage
        questions = df.get(input_instruction_key).to_list()
        answers = df.get(input_output_key).to_list()
        # Generate critiques
        critique_responses = self.generate_critique(questions, answers)
        self.logger.info('Generated critiques for the answers.')
        # Generate refined answers
        refined_answers = self.generate_refined_answer(questions, answers, critique_responses)
        self.logger.info('Refined answers generated.')
df[input_output_key] = refined_answers
output_file = storage.write(df)
        self.logger.info('Refined answers updated in storage.')
return [input_output_key]
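# A minimal usage sketch (hypothetical `my_llm_serving` and `my_storage`):
#
#   refiner = CondorRefiner(llm_serving=my_llm_serving)
#   refiner.run(storage=my_storage, input_instruction_key="instruction", input_output_key="output")
#
# The 'output' column is overwritten in place with the critique-refined answers.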