Commit 97e8278b authored by zzg_666

Adapt backend to vLLM

import random
import re
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy.spatial.distance import cdist
from dataflow.prompts.text2sql import Text2SQLQuestionGeneratorPrompt, Text2VecSQLQuestionGeneratorPrompt
from dataflow.core.prompt import prompt_restrict, DIYPromptABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC, LLMServingABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.text2sql.database_manager import DatabaseManager
from typing import Union
@prompt_restrict(Text2SQLQuestionGeneratorPrompt, Text2VecSQLQuestionGeneratorPrompt)
@OPERATOR_REGISTRY.register()
class Text2SQLQuestionGenerator(OperatorABC):
def __init__(self,
llm_serving: LLMServingABC,
embedding_serving: LLMServingABC,
database_manager: DatabaseManager,
question_candidates_num: int = 5,
prompt_template: Union[Text2SQLQuestionGeneratorPrompt, Text2VecSQLQuestionGeneratorPrompt, DIYPromptABC] = None
):
self.llm_serving = llm_serving
self.embedding_serving = embedding_serving
self.database_manager = database_manager
if prompt_template is None:
self.prompt_template = Text2SQLQuestionGeneratorPrompt()
else:
self.prompt_template = prompt_template
self.logger = get_logger()
self.question_candidates_num = question_candidates_num
random.seed(42)
@staticmethod
def get_desc(lang):
if lang == "zh":
return (
"对于每个条目,如果自然语言问题为空,生成SQL对应的自然语言问题。为保证正确,生成多个候选问题,并选择最优的。\n\n"
"输入参数:\n"
"- input_sql_key: 输入SQL列名\n"
"- input_db_id_key: 数据库ID列名\n\n"
"输出参数:\n"
"- output_question_key: 输出问题列名"
)
elif lang == "en":
return (
"This operator generates natural language questions for Text2SQL tasks if the natural language question is empty. Multiple candidate questions are generated to ensure correctness.\n\n"
"Input parameters:\n"
"- input_sql_key: The name of the input SQL column\n"
"- input_db_id_key: The name of the database ID column\n\n"
"Output parameters:\n"
"- output_question_key: The name of the output question column"
)
else:
return "Question generator for Text2SQL tasks."
def extract_column_descriptions(self, create_statements):
column_name2column_desc = dict()
pattern = r'"(\w+)"\s+\w+\s*/\*\s*(.*?)\s*\*/'
for create_statement in create_statements:
matches = re.findall(pattern, create_statement)
for column_name, description in matches:
column_name = column_name.lower()
if column_name not in column_name2column_desc:
column_name2column_desc[column_name] = description
return column_name2column_desc
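# Illustrative sketch with a hypothetical schema: given a CREATE statement such as
#   CREATE TABLE singer (
#       "singer_id" INTEGER /* unique id of the singer */,
#       "name" TEXT /* full name of the singer */
#   )
# the pattern above yields {"singer_id": "unique id of the singer", "name": "full name of the singer"}.
# Column names are lower-cased and the first description encountered wins on duplicates.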
def parse_llm_response(self, response, style):
explanation_pattern = re.compile(r'\[EXPLANATION-START\](.*?)\[EXPLANATION-END\]', re.DOTALL)
question_pattern = re.compile(r'\[QUESTION-START\](.*?)\[QUESTION-END\]', re.DOTALL)
external_knowledge_pattern = re.compile(r'\[EXTERNAL-KNOWLEDGE-START\](.*?)\[EXTERNAL-KNOWLEDGE-END\]', re.DOTALL)
explanation_match = explanation_pattern.search(response)
question_match = question_pattern.search(response)
external_knowledge_match = external_knowledge_pattern.search(response)
explanation_content = explanation_match.group(1).strip() if explanation_match else ""
question_content = question_match.group(1).strip() if question_match else ""
external_knowledge_content = external_knowledge_match.group(1).strip() if external_knowledge_match else ""
if explanation_content == "" or question_content == "":
return None
else:
return {
"question": question_content.strip(),
"external_knowledge": external_knowledge_content.strip()
}
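# Expected response layout (a hedged illustration; the exact wording is set by the prompt template):
#   [EXPLANATION-START] The SQL counts singers grouped by country ... [EXPLANATION-END]
#   [QUESTION-START] How many singers are there in each country? [QUESTION-END]
#   [EXTERNAL-KNOWLEDGE-START] [EXTERNAL-KNOWLEDGE-END]
# parse_llm_response returns {"question": ..., "external_knowledge": ...} and returns None
# whenever the explanation or the question block is missing or empty.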
def select_best_question(self, question_candidates, start_idx, embeddings):
if len(question_candidates) == 0:
return None
elif len(question_candidates) == 1:
return question_candidates[0]
elif len(question_candidates) == 2:
return random.sample(question_candidates, 1)[0]
else:
end_idx = start_idx + len(question_candidates)
candidate_embeddings = embeddings[start_idx:end_idx]
distance_matrix = cdist(candidate_embeddings, candidate_embeddings, metric='cosine')
distance_sums = distance_matrix.sum(axis=1)
min_index = np.argmin(distance_sums)
return question_candidates[min_index]
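# Minimal sketch of the selection rule with assumed toy embeddings (not real model output):
#   import numpy as np
#   from scipy.spatial.distance import cdist
#   embeddings = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
#   d = cdist(embeddings, embeddings, metric="cosine")
#   best = int(np.argmin(d.sum(axis=1)))   # -> 1, the candidate closest to all the others
# With exactly two candidates there is no meaningful "most central" one, so one is picked at random.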
def run(self, storage: DataFlowStorage,
input_sql_key: str = "sql",
input_db_id_key: str = "db_id",
output_question_key: str = "question",
output_evidence_key: str = "evidence"
):
self.input_sql_key = input_sql_key
self.input_db_id_key = input_db_id_key
self.output_question_key = output_question_key
self.output_evidence_key = output_evidence_key
raw_dataframe = storage.read("dataframe")
existing_data = []
raw_data = []
if self.output_question_key in raw_dataframe.columns:
for _, row in raw_dataframe.iterrows():
if pd.notna(row.get(self.output_question_key)) and row.get(self.output_question_key) is not None:
existing_data.append(row.to_dict())
else:
raw_data.append(row.to_dict())
else:
raw_data = [row.to_dict() for _, row in raw_dataframe.iterrows()]
db_ids = list(set([data[self.input_db_id_key] for data in raw_data]))
db_id2column_info = dict()
for db_id in tqdm(db_ids, desc="Extracting database schema"):
create_statements, _ = self.database_manager.get_create_statements_and_insert_statements(db_id)
db_id2column_info[db_id] = self.extract_column_descriptions(create_statements)
self.logger.info("Generating question candidates...")
prompts = []
prompt_data_mapping = []
for data in tqdm(raw_data, desc="Preparing prompts"):
prompt = self.prompt_template.build_prompt(
data[self.input_sql_key],
data[self.input_db_id_key],
db_id2column_info,
self.database_manager.db_type
)
for _ in range(self.question_candidates_num):
prompts.append(prompt)
prompt_data_mapping.append({**data})
responses = self.llm_serving.generate_from_input(prompts, system_prompt="You are a helpful assistant.")
self.logger.info("Parsing responses and organizing candidates...")
grouped_responses = [responses[i:i+self.question_candidates_num] for i in range(0, len(responses), self.question_candidates_num)]
all_question_candidates = []
question_groups = []
embedding_texts = []
for data, response_group in zip(raw_data, grouped_responses):
question_candidates = []
for response in response_group:
parsed_response = self.parse_llm_response(response, data.get("style", "Formal"))
if parsed_response:
question_candidates.append(parsed_response)
text = parsed_response["external_knowledge"] + " " + parsed_response["question"]
embedding_texts.append(text.strip())
question_groups.append(question_candidates)
all_question_candidates.extend(question_candidates)
self.logger.info("Generating embeddings for all question candidates...")
if embedding_texts:
embeddings = self.embedding_serving.generate_embedding_from_input(embedding_texts)
else:
embeddings = []
processed_results = []
failed_data = []
embedding_start_idx = 0
for data, question_candidates in zip(raw_data, question_groups):
if question_candidates:
best_question = self.select_best_question(
question_candidates,
embedding_start_idx,
embeddings
)
embedding_start_idx += len(question_candidates)
if best_question:
result = {
**data,
self.output_question_key: best_question["question"],
self.output_evidence_key: best_question["external_knowledge"]
}
processed_results.append(result)
else:
self.logger.warning(f"No valid question generated for data: {data[self.input_db_id_key]}")
failed_data.append(data)
else:
self.logger.warning(f"No question candidates for data: {data[self.input_db_id_key]}")
failed_data.append(data)
if self.output_question_key in raw_dataframe.columns:
all_results = existing_data + processed_results
else:
all_results = processed_results
final_df = pd.DataFrame(all_results)
output_file = storage.write(final_df)
self.logger.info(f"Question generation results saved to {output_file}")
self.logger.info(f"Successfully processed: {len(processed_results)}")
if failed_data:
self.logger.warning(f"Failed to generate questions for: {len(failed_data)} entries")
return [self.output_question_key, self.output_evidence_key]
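# Hedged usage sketch; the objects below are placeholders for whatever LLMServingABC,
# embedding serving, DatabaseManager and DataFlowStorage instances the surrounding
# DataFlow pipeline provides, and their constructor arguments may differ.
#   generator = Text2SQLQuestionGenerator(
#       llm_serving=my_llm_serving,
#       embedding_serving=my_embedding_serving,
#       database_manager=my_database_manager,
#       question_candidates_num=5,
#   )
#   generator.run(storage, input_sql_key="sql", input_db_id_key="db_id",
#                 output_question_key="question", output_evidence_key="evidence")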
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# filter
from .filter.ccnet_deduplicate_filter import CCNetDeduplicateFilter
from .filter.debertav3_filter import DebertaV3Filter
from .filter.fineweb_edu_filter import FineWebEduFilter
from .filter.pair_qual_filter import PairQualFilter
from .filter.perplexity_filter import PerplexityFilter
from .filter.qurating_filter import QuratingFilter
from .filter.text_book_filter import TextbookFilter
# generate
from .generate.phi4qa_generator import Phi4QAGenerator
# eval
from .eval.debertav3_sample_evaluator import DebertaV3SampleEvaluator
from .eval.fineweb_edu_sample_evaluator import FineWebEduSampleEvaluator
from .eval.pair_qual_sample_evaluator import PairQualSampleEvaluator
from .eval.textbook_sample_evaluator import TextbookSampleEvaluator
from .eval.qurating_sample_evaluator import QuratingSampleEvaluator
from .eval.perplexity_sample_evaluator import PerplexitySampleEvaluator
from .eval.meta_sample_evaluator import MetaSampleEvaluator
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/text_pt/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/text_pt/", _import_structure)
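# A hedged sketch of what the TYPE_CHECKING + LazyLoader pattern above buys (assuming
# LazyLoader resolves attributes to the corresponding submodule imports on first access):
#   import dataflow.operators.text_pt as text_pt   # no heavy filter/eval imports yet
#   FilterCls = text_pt.DebertaV3Filter             # attribute access triggers the real import
# Static type checkers still see the explicit imports listed under TYPE_CHECKING.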
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch LLaMA model."""
from typing import List, Optional, Tuple, Union, Any
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
import torch.distributed as dist
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from transformers.models.llama.configuration_llama import LlamaConfig
def try_import_flash_attention():
# Import flash-attn lazily so this module can be loaded on CPU-only machines; bind the
# kernels as module-level globals because they are referenced throughout this file.
global flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func, flash_attn_with_kvcache
global unpad_input, pad_input, apply_rotary_emb_func
try:
from flash_attn import flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func, flash_attn_with_kvcache
from flash_attn.bert_padding import unpad_input, pad_input
from flash_attn.layers.rotary import apply_rotary_emb_func
except ImportError as e:
if 'flash_attn.layers.rotary' in str(e):
raise ImportError('Please install RoPE kernels: `pip install git+https://github.com/HazyResearch/flash-attention.git#subdirectory=csrc/rotary`')
else:
raise ImportError('Please install the flash_attn dependency in a GPU environment')
from dataflow import get_logger
logger = logging.get_logger(__name__)
# @torch.jit.script
def rmsnorm_func(hidden_states, weight, variance_epsilon):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
return (weight * hidden_states).to(input_dtype)
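# rmsnorm_func computes y = weight * x / sqrt(mean(x**2, dim=-1) + eps) in fp32 and casts back
# to the input dtype. A quick equivalence check (toy tensors, not part of the model):
#   x = torch.randn(2, 4, dtype=torch.float16)
#   w = torch.ones(4)
#   ref = (w * (x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + 1e-6))).to(x.dtype)
#   assert torch.allclose(rmsnorm_func(x, w, torch.tensor(1e-6)), ref)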
class LlamaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
LlamaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.register_buffer(
"variance_epsilon",
torch.tensor(eps),
persistent=False,
)
def forward(self, hidden_states):
return rmsnorm_func(hidden_states, self.weight, self.variance_epsilon)
class FlashRotaryEmbedding(torch.nn.Module):
"""
The rotary position embeddings from RoFormer_ (Su et. al).
A crucial insight from the method is that the query and keys are
transformed by rotation matrices which depend on the relative positions.
Other implementations are available in the Rotary Transformer repo_ and in
GPT-NeoX_; GPT-NeoX was an inspiration for this implementation.
.. _RoFormer: https://arxiv.org/abs/2104.09864
.. _repo: https://github.com/ZhuiyiTechnology/roformer
.. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
"""
def __init__(self, dim: int, base=10000.0, interleaved=False, scale_base=None,
scaling_factor=1.0, pos_idx_in_fp32=True, device=None):
"""
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
otherwise they might be in lower precision.
This option was added because previously (before 2023-07-02), when we construct
the position indices, we use the dtype of self.inv_freq. In most cases this would
be fp32, but if the model is trained in pure bf16 (not mixed precision), then
self.inv_freq would be bf16, and the position indices are also in bf16.
Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
embeddings for some positions will coincide.
To maintain compatibility with models previously trained in pure bf16,
we add this option.
scaling_factor: RotaryEmbedding extended with linear scaling.
"""
super().__init__()
self.dim = dim
self.base = float(base)
self.pos_idx_in_fp32 = pos_idx_in_fp32
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = self._compute_inv_freq(device)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.interleaved = interleaved
self.scale_base = scale_base
self.scaling_factor = scaling_factor
scale = ((torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
/ (1.4 * dim) if scale_base is not None else None)
self.register_buffer("scale", scale)
self._seq_len_cached = 0
self._cos_cached = None
self._sin_cached = None
self._cos_k_cached = None
self._sin_k_cached = None
def _compute_inv_freq(self, device=None):
return 1 / (self.base ** (torch.arange(0, self.dim, 2, device=device,
dtype=torch.float32) / self.dim))
def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
# Reset the tables if the sequence length has changed,
# if we're on a new device (possibly due to tracing for instance),
# or if we're switching from inference mode to training
if (seqlen > self._seq_len_cached or self._cos_cached.device != device
or self._cos_cached.dtype != dtype
or (self.training and self._cos_cached.is_inference())):
self._seq_len_cached = seqlen
# We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
# And the output of arange can be quite large, so bf16 would lose a lot of precision.
# However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
if self.pos_idx_in_fp32:
t = torch.arange(seqlen, device=device, dtype=torch.float32)
t /= self.scaling_factor
# We want fp32 here as well since inv_freq will be multiplied with t, and the output
# will be large. Having it in bf16 will lose a lot of precision and cause the
# cos & sin output to change significantly.
# We want to recompute self.inv_freq if it was not loaded in fp32
if self.inv_freq.dtype != torch.float32:
inv_freq = self.inv_freq.to(torch.float32)
else:
inv_freq = self.inv_freq
else:
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
t /= self.scaling_factor
inv_freq = self.inv_freq
# Don't do einsum, it converts fp32 to fp16 under AMP
# freqs = torch.einsum("i,j->ij", t, self.inv_freq)
freqs = torch.outer(t, inv_freq)
if self.scale is None:
self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype)
else:
power = ((torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
- seqlen // 2) / self.scale_base)
scale = self.scale.to(device=power.device) ** power.unsqueeze(-1)
# We want the multiplication by scale to happen in fp32
self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
def forward(self,
q: torch.Tensor, k: torch.Tensor,
seqlen_offset: int = 0,
unpadded_lengths: Optional[Tuple[torch.Tensor]] = None) -> Tuple[torch.Tensor, torch.Tensor]:
"""
q: (batch, seqlen, nheads, headdim)
k: (batch, seqlen, nheads, headdim)
seqlen_offset: can be used in generation where the qkv being passed in is only the last
token in the batch.
"""
if unpadded_lengths is not None:
cu_seqlens, max_seqlen = unpadded_lengths
else:
cu_seqlens, max_seqlen = None, q.shape[1]
self._update_cos_sin_cache(max_seqlen + seqlen_offset, device=q.device, dtype=q.dtype)
if self.scale is None:
return apply_rotary_emb_func(
q, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:],
self.interleaved, True, # inplace=True,
cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
), apply_rotary_emb_func(
k, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:],
self.interleaved, True, # inplace=True
cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
)
else:
assert False, "XPos (scale_base is not None) is not supported in this rotary embedding path"
class LlamaMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
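# LlamaMLP above is the standard SwiGLU feed-forward used by LLaMA:
#   down_proj( SiLU(gate_proj(x)) * up_proj(x) )
# config.hidden_act is typically "silu", so ACT2FN[config.hidden_act] resolves to the SiLU activation.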
@torch.jit.script
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
if n_rep == 1:
return hidden_states
final_shape = list(hidden_states.shape[:-2]) + [-1] + [hidden_states.shape[-1]]
expand_shape = [-1] * (len(hidden_states.shape) - 1) + [n_rep] + [-1]
hidden_states = hidden_states.unsqueeze(-2).expand(expand_shape)
return hidden_states.reshape(final_shape)
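# repeat_kv expands the KV heads for grouped-query attention. A shape-level sketch
# (hypothetical sizes): the stacked kv tensor of shape
#   (batch, seqlen, 2, num_key_value_heads=8, head_dim=128)
# with n_rep=4 is unsqueezed, expanded and reshaped to
#   (batch, seqlen, 2, 32, 128),
# i.e. every key/value head is repeated n_rep times to line up with 32 query heads.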
class LlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: LlamaConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = getattr(config, "num_key_value_heads", self.num_heads)
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self.register_buffer(
"norm_factor",
torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()),
persistent=False,
)
if not getattr(self.config, "rope_scaling", None):
scaling_factor = 1
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
assert scaling_type == 'linear'
theta = getattr(self.config, "rope_theta", 10000)
self.rotary_emb = FlashRotaryEmbedding(
self.head_dim, base=theta, interleaved=False, scaling_factor=scaling_factor,
)
self.distributed_attn_func = flash_attn_kvpacked_func
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
unpadded_lengths: Optional[Tuple[torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
h_size = hidden_states.size(-1)
has_layer_past = past_key_value is not None
if has_layer_past:
past_kv = past_key_value[0]
past_len = past_key_value[1]
else:
past_len = 0
# NOTE: Hack to include position_ids, assuming they are increasing uniformly per block
if position_ids is not None:
past_len += position_ids.min()
q = self.q_proj(hidden_states)
k = self.k_proj(hidden_states)
v = self.v_proj(hidden_states)
q = q.view(*q.shape[:-1], self.num_heads, self.head_dim)
k = k.view(*k.shape[:-1], self.num_key_value_heads, self.head_dim)
v = v.view(*v.shape[:-1], self.num_key_value_heads, self.head_dim)
q, k = self.rotary_emb(q, k, past_len, unpadded_lengths)
kv = torch.stack([k, v], -3)
kv = repeat_kv(kv, self.num_key_value_groups)
# Cache QKV values
if has_layer_past:
new_len = past_len+q.size(1)
if new_len > past_kv.size(1):
past_kv = torch.cat([past_kv, torch.empty(hidden_states.size(0), 256, 2, kv.size(3), kv.size(4), dtype=kv.dtype, device=kv.device)], 1)
past_kv[:, past_len:new_len] = kv
kv = past_kv[:, :new_len]
else:
past_kv = kv
if unpadded_lengths is not None:
# varlen, ignore padding tokens, efficient for large batch with many paddings
assert attention_mask is not None
cu_seqlens, max_seqlen = unpadded_lengths
attn_outputs = flash_attn_varlen_kvpacked_func(
q, kv,
cu_seqlens, cu_seqlens,
max_seqlen, max_seqlen,
dropout_p=0.0, softmax_scale=1.0/self.norm_factor,
causal=True, return_attn_probs=output_attentions
)
# elif use_cache and past_key_value is not None:
# attn_outputs = flash_attn_with_kvcache(
# q,
# kv[:, :, 0],
# kv[:, :, 1],
# softmax_scale=1.0/self.norm_factor,
# causal=True,
# )
else:
attn_outputs = flash_attn_kvpacked_func(
q, kv,
dropout_p=0.0,
softmax_scale=1.0/self.norm_factor,
causal=True,
return_attn_probs=output_attentions,
)
past_key_value = (past_kv, past_len+q.size(1)) if use_cache else None
attn_output = attn_outputs[0] if output_attentions else attn_outputs
attn_output = attn_output.reshape(*attn_output.shape[:-2], h_size)
attn_weights = attn_outputs[2] if output_attentions else None
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class LlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = LlamaAttention(config=config)
self.mlp = LlamaMLP(config)
self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self._fsdp_wrap = True
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
unpadded_lengths: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
unpadded_lengths=unpadded_lengths,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class LlamaPreTrainedModel(PreTrainedModel):
config_class = LlamaConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
class LlamaModel(LlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
try_import_flash_attention()
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
# position_ids = None
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
hidden_states = inputs_embeds
bsz = hidden_states.size(0)
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
if (
((attention_mask is not None) and (not attention_mask.all().item()))
and not use_cache
):
try: # for flash-attn latest version
hidden_states, unpad_indices, cu_seqlens, max_seqlen, _ = unpad_input(hidden_states, attention_mask)
except ValueError: # for flash-attn 2.3.3 version, where unpad_input returns 4 values
hidden_states, unpad_indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
unpadded_lengths = (cu_seqlens, max_seqlen)
else:
unpadded_lengths = None
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
if output_hidden_states:
if unpadded_lengths is not None:
all_hidden_states += (pad_input(hidden_states, unpad_indices, bsz, max_seqlen),)
else:
all_hidden_states += (hidden_states,)
past_key_value = past_key_values[idx] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
decoder_layer,
hidden_states,
attention_mask,
position_ids,
None,
unpadded_lengths,
output_attentions,
False,
use_reentrant=False
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
unpadded_lengths=unpadded_lengths,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
if unpadded_lengths is not None:
hidden_states = pad_input(hidden_states, unpad_indices, bsz, max_seqlen)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class LlamaForCausalLM(LlamaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = LlamaModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
try_import_flash_attention()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
avg_valid_labels_per_chunk: Optional[float] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example:
```python
>>> from transformers import AutoTokenizer, LlamaForCausalLM
>>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states).float()
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values:
input_ids = input_ids[:, -1:]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
class LlamaForSequenceClassification(LlamaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = LlamaModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
try_import_flash_attention()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.model(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
logits = self.score(hidden_states)
if input_ids is not None:
batch_size = input_ids.shape[0]
else:
batch_size = inputs_embeds.shape[0]
if self.config.pad_token_id is None and batch_size != 1:
raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
else:
sequence_lengths = -1
pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(pooled_logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(pooled_logits, labels)
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
from datasets import load_from_disk, load_dataset, concatenate_datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from .modeling.modeling_flash_llama import LlamaForSequenceClassification
import torch
import argparse
import numpy as np
class TokenizeAndChunk:
def __init__(self, tokenizer_name, text_field, tokens_field, tokens, model_cache_dir=None):
self.tokens = tokens
self.tokenizer_name = tokenizer_name
self.text_field = text_field
self.tokens_field = tokens_field
self.model_cache_dir = model_cache_dir
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True, cache_dir=model_cache_dir)
self.tokenizer.pad_token_id = 0
def __getstate__(self):
return {
"tokenizer_name": self.tokenizer_name,
"text_field": self.text_field,
"tokens_field": self.tokens_field,
"tokens": self.tokens,
"model_cache_dir": self.model_cache_dir,
}
def __setstate__(self, state):
self.__init__(**state)
def tokenize_and_chunk(self, source_tokens):
chunks_token_ids = []
chunks_token_counts = []
for seq in source_tokens:
chunks = torch.tensor(seq, dtype=torch.long).split(self.tokens)
chunks_token_ids.append([chunk.tolist() for chunk in chunks])
chunks_token_counts.append([len(x) for x in chunks])
return chunks_token_ids, chunks_token_counts
def __call__(self, example):
if self.tokens_field in example:
source_tokens = example[self.tokens_field]
else:
source_tokens = self.tokenizer(example[self.text_field], truncation=False, padding=False, add_special_tokens=False).input_ids
chunks_token_ids, chunks_token_counts = self.tokenize_and_chunk(source_tokens)
assert len(example[self.text_field]) == len(chunks_token_ids)
assert len(example[self.text_field]) == len(chunks_token_counts)
return {
"chunks_token_ids": chunks_token_ids,
"chunks_token_counts": chunks_token_counts,
}
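# Chunking sketch: with tokens=512, a 1300-token document is split into pieces of
# 512, 512 and 276 tokens, so for that document
#   chunks_token_ids    -> [[...512 ids...], [...512 ids...], [...276 ids...]]
#   chunks_token_counts -> [512, 512, 276]
# If the dataset already carries pre-tokenized ids in `tokens_field`, tokenization is skipped.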
class ModelAnnotator:
def __init__(self, model_name, labels, device_batch_size, device=None, model_cache_dir=None):
self.model_name = model_name
self.labels = labels
self.device_batch_size = device_batch_size
self.model_cache_dir = model_cache_dir
self.model = LlamaForSequenceClassification.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
cache_dir=model_cache_dir)
self.model.config.pad_token_id = 0
self.model.eval()
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {self.device}")
self.model.to(self.device)
self.num_labels = len(labels)
assert self.num_labels == self.model.config.num_labels, f"Number of labels ({self.num_labels}) does not match model config ({self.model.config.num_labels})"
def __getstate__(self):
return {
"model_name": self.model_name,
"labels": self.labels,
"device_batch_size": self.device_batch_size,
"model_cache_dir": self.model_cache_dir,
}
def __setstate__(self, state):
self.__init__(**state)
@torch.inference_mode()
def score_chunks(self, chunks_token_ids, chunks_token_counts):
sorted_indices = torch.argsort(chunks_token_counts)
scores = torch.zeros(len(chunks_token_ids), self.num_labels, dtype=torch.float32)
for batch_indices in sorted_indices.split(self.device_batch_size):
max_len = chunks_token_counts[batch_indices].max()
input_ids = torch.zeros((len(batch_indices), max_len), dtype=torch.long)
attention_mask = torch.zeros((len(batch_indices), max_len), dtype=torch.long)
for i, j in enumerate(batch_indices):
seq = chunks_token_ids[j]
input_ids[i, :len(seq)] = seq
attention_mask[i, :len(seq)] = 1
outputs = self.model(input_ids.to(self.device), attention_mask=attention_mask.to(self.device), use_cache=False)
scores[batch_indices] = outputs.logits.float().cpu()
return scores
def __call__(self, example, indices):
num_seqs = len(indices)
source_ids = [i for i, counts in enumerate(example["chunks_token_counts"]) for _ in range(len(counts))]
chunks_token_ids = [torch.tensor(chunk, dtype=torch.long) for chunks in example["chunks_token_ids"] for chunk in chunks]
flattened_chunks_token_counts = torch.tensor([chunk for chunks in example["chunks_token_counts"] for chunk in chunks], dtype=torch.long)
flattened_scores = self.score_chunks(chunks_token_ids, flattened_chunks_token_counts)
chunk_token_counts = example["chunks_token_counts"]
chunk_scores = [[[] for _ in range(num_seqs)] for _ in range(self.num_labels)]
for source_id, score in zip(source_ids, flattened_scores):
for label in range(self.num_labels):
chunk_scores[label][source_id].append(score[label].item())
output = {
"index": indices,
"chunk_lengths": chunk_token_counts,
"length": [sum(counts) for counts in chunk_token_counts],
}
for i, label in enumerate(self.labels):
output[f"{label}_chunks"] = chunk_scores[i]
output[f"{label}_average"] = [
np.average(scores, weights=token_counts).item()
for scores, token_counts in zip(chunk_scores[i], chunk_token_counts)
]
return output
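# Per-document aggregation sketch (hypothetical label "educational_value"): a document whose
# chunks have lengths [512, 276] and scores [1.2, -0.4] ends up with
#   educational_value_chunks  = [1.2, -0.4]
#   educational_value_average = np.average([1.2, -0.4], weights=[512, 276])   # ~0.64
# i.e. a token-count-weighted mean of the per-chunk logits.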
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("input", type=str)
parser.add_argument("output", type=str)
parser.add_argument("-F", "--data_files", type=str, nargs="+", default=[])
parser.add_argument("-S", "--shard", type=int, nargs=2, default=[0, 1])
parser.add_argument("-M", "--model", type=str, required=True)
parser.add_argument("-t", "--tokens", type=int, default=512)
parser.add_argument("--map_batch_size", type=int, default=512)
parser.add_argument("-b", "--device_batch_size", type=int, default=16)
parser.add_argument("-w", "--num_workers", type=int, default=1)
parser.add_argument("--text_field", type=str, default="text")
parser.add_argument("--tokens_field", type=str, default="input_ids")
parser.add_argument("--labels", type=str, nargs="+")
args = parser.parse_args()
print(args)
if args.input == "json":
dataset = load_dataset("json", data_files=args.data_files, split="train")
else:
dataset = load_from_disk(args.input)
src_dataset = dataset.shard(args.shard[1], args.shard[0], contiguous=True)
dataset = src_dataset
print(dataset)
print("Total number of examples:", len(dataset))
dataset = dataset.map(
TokenizeAndChunk(args.model, args.text_field, args.tokens_field, args.tokens),
batched=True,
batch_size=args.map_batch_size,
num_proc=args.num_workers,
remove_columns=dataset.column_names)
print("After tokenization: Total number of examples:", len(dataset))
dataset = dataset.map(
ModelAnnotator(args.model, args.labels, args.device_batch_size),
batched=True,
with_indices=True,
batch_size=args.map_batch_size,
remove_columns=dataset.column_names)
dataset = concatenate_datasets([dataset, src_dataset], axis=1)
print("After annotation: Total number of examples:", len(dataset))
print(f"Saving to {args.output}")
dataset.save_to_disk(args.output)
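# Hedged invocation sketch; the script name, model id and label names below are illustrative only:
#   python annotate.py json out_dir \
#       -F data/*.jsonl -M princeton-nlp/QuRater-1.3B \
#       --labels writing_style facts_and_trivia educational_value required_expertise \
#       -t 512 -b 16 -w 4
# The first positional argument is either the literal "json" (load shards via --data_files)
# or a path previously produced by datasets.save_to_disk.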
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import PyTorchModelHubMixin
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
from dataflow import get_logger
@OPERATOR_REGISTRY.register()
class DebertaV3SampleEvaluator(OperatorABC):
def __init__(self, model_name, model_cache_dir='./dataflow_cache', device='cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model_name = model_name
self.model_cache_dir = model_cache_dir
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.score_name = 'DebertaV3Score'
self.config = AutoConfig.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = QualityModel.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.model.eval()
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Nvidia Deberta V3模型的质量分类器,用于评估文本质量并返回分类结果。\n"
"输入参数:\n"
"- model_name:预训练模型名称\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- device:计算设备,默认为'cuda'\n"
"- input_key:输入文本字段名\n"
"- output_key:输出分类结果字段名,默认为'Debertav3Score'\n"
"输出参数:\n"
"- 包含文本质量分类结果的DataFrame"
)
elif lang == "en":
return (
"Text quality classifier based on Nvidia Deberta V3 model for quality assessment and classification.\n"
"Input Parameters:\n"
"- model_name: Pretrained model name\n"
"- model_cache_dir: Model cache directory, default './dataflow_cache'\n"
"- device: Computing device, default 'cuda'\n"
"- input_key: Field name for input text\n"
"- output_key: Field name for output classification, default 'Debertav3Score'\n"
"Output Parameters:\n"
"- DataFrame containing text quality classification results"
)
else:
return "Text quality classifier based on Nvidia Deberta V3."
def _score_func(self, sample):
inputs = self.tokenizer(
sample, return_tensors="pt", padding="longest", truncation=True
).to(self.device)
with torch.no_grad():
outputs = self.model(inputs["input_ids"], inputs["attention_mask"])
predicted_classes = torch.argmax(outputs, dim=1)
predicted_domains = [
self.config.id2label[class_idx.item()] for class_idx in predicted_classes.cpu().numpy()
]
return predicted_domains[0] # Assuming one sample per batch
def eval(self, dataframe, input_key):
scores = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="DebertaV3 model evaluating..."):
score = self._score_func(sample)
scores.append(score)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='Debertav3Score'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
class QualityModel(nn.Module, PyTorchModelHubMixin):
def __init__(self, config):
super(QualityModel, self).__init__()
self.model = AutoModel.from_pretrained(config["base_model"])
self.dropout = nn.Dropout(config["fc_dropout"])
self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))
def forward(self, input_ids, attention_mask):
features = self.model(
input_ids=input_ids, attention_mask=attention_mask
).last_hidden_state
dropped = self.dropout(features)
outputs = self.fc(dropped)
return torch.softmax(outputs[:, 0, :], dim=1)
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dataflow.core import OperatorABC
from dataflow import get_logger
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
import numpy as np
@OPERATOR_REGISTRY.register()
class FineWebEduSampleEvaluator(OperatorABC):
def __init__(self, model_cache_dir: str = './dataflow_cache', device: str = 'cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model_name = 'HuggingFaceTB/fineweb-edu-classifier'
self.model_cache_dir = model_cache_dir
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.batch_size = 1
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.model.eval()
self.score_name = 'FineWebEduScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Fineweb-Edu分类器评估文本的教育价值。该分类器使用预训练的序列分类模型对文本进行评估,返回0-1之间的分数,"
"分数越高表示文本的教育价值越高。适用于筛选具有教育意义的文本内容。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"输出参数:\n"
"- float: 0-1之间的教育价值分数,越高表示教育价值越大"
)
else:
return (
"Evaluate the educational value of text using the Fineweb-Edu classifier. This classifier uses a pre-trained sequence classification model "
"to assess text and returns a score between 0 and 1, where higher scores indicate greater educational value. Suitable for filtering educational content.\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"Output parameters:\n"
"- float: Educational value score between 0 and 1, higher values indicate greater educational value"
)
def _score_func(self, sample):
tokenized_inputs = self.tokenizer(sample, return_tensors="pt", padding="longest", truncation=True).to(self.device)
with torch.no_grad():
outputs = self.model(**tokenized_inputs)
logits = outputs.logits.squeeze(-1).float().detach().cpu().numpy()
return logits.tolist()[0]  # One sample per call, so return its scalar score
def eval(self, dataframe, input_key):
scores = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="Fineweb-edu model evaluating..."):
score = self._score_func(sample)
scores.append(score)
self.logger.info("Evaluation complete!")
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='FinewebEduScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.prompts.general_text import MetaPrompt
import ast
from dataflow.core.prompt import prompt_restrict
example_dimensions = [
{
"dimension_name": "Text Structure",
"description": "Evaluate the surface-level quality of the text, including spelling accuracy, grammar, vocabulary richness, and sentence structure.",
"example_list": [
{
"text": "The experimental procedure was meticulously documented, with each variable clearly defined.",
"score": "5"
},
{
"text": "teh data was wrong and we dont no why it happen like that",
"score": "2"
}
]
},
{
"dimension_name": "Diversity and Complexity",
"description": "Assess how rich and conceptually varied the content is, and whether it requires expert or deep reasoning to understand.",
"example_list": [
{
"text": "This article compares Bayesian inference and frequentist approaches in statistical modeling, highlighting theoretical and practical trade-offs.",
"score": "5"
},
{
"text": "Dogs are pets. They bark. They are friendly.",
"score": "2"
}
]
},
{
"dimension_name": "Fluency and Understandability",
"description": "Evaluate whether the text flows naturally, is easy to follow, and avoids awkward or disjointed phrasing.",
"example_list": [
{
"text": "Despite initial challenges, the team successfully completed the deployment by adhering to a revised strategy.",
"score": "5"
},
{
"text": "The problem was and then fixed by something happens deployment successful maybe.",
"score": "2"
}
]
},
{
"dimension_name": "Safety",
"description": "Identify whether the text contains profanities, hate speech, or excessive personally identifiable information (PII).",
"example_list": [
{
"text": "The software collects anonymous usage data to improve performance.",
"score": "5"
},
{
"text": "You idiot, your address 123 Main St will be posted online.",
"score": "1"
}
]
},
{
"dimension_name": "Educational Value",
"description": "Determine whether the text provides insight, stimulates thinking, or offers meaningful learning potential.",
"example_list": [
{
"text": "Understanding the principles of thermodynamics allows engineers to design more efficient engines.",
"score": "5"
},
{
"text": "The sky is blue. Water is wet. This is how it is.",
"score": "2"
}
]
},
{
"dimension_name": "Content Accuracy and Effectiveness",
"description": "Assess the truthfulness, relevance, and practical usefulness of the content.",
"example_list": [
{
"text": "Newton's second law states that F = ma, which explains the relationship between force, mass, and acceleration.",
"score": "5"
},
{
"text": "The Earth is flat and doesn't rotate around the Sun.",
"score": "1"
}
]
}
]
@prompt_restrict(
MetaPrompt
)
@OPERATOR_REGISTRY.register()
class MetaSampleEvaluator(OperatorABC):
def __init__(self,
llm_serving: LLMServingABC = None,
dimensions: list[dict] = example_dimensions,
):
"""
Operator that evaluates the quality of the text along the given dimensions.
The `dimensions` argument should be a list of dicts, each of the form:
{
"dimension_name": "Dimension Name",
"description": "Description of the dimension",
"example_list": [ // a list of example text and score
{
"text": "example1 text to be evaluated",
"score": "the score of this dimension of the text above"
},
{
"text": "example2 text to be evaluated",
"score": "the score of this dimension of the text above"
}
]
}
"""
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.llm_serving = llm_serving
self.score_name = 'MetaScore'
self.prompt = MetaPrompt(dimensions=dimensions)
self.logger.info(f'{self.__class__.__name__} initialized.')
self.dimensions = dimensions
for item in dimensions:
if 'dimension_name' not in item or 'description' not in item or 'example_list' not in item:
raise ValueError('Invalid dimension format. Refer to the docstring for the correct format.')
for example in item['example_list']:
if 'text' not in example or 'score' not in example:
raise ValueError('Invalid example format. Refer to the docstring for the correct format.')
self.output_columns = [item['dimension_name'] for item in dimensions]
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过LLM评估文本的多个元属性,包括文本结构、多样性与复杂性、流畅性与可理解性、安全性、教育价值以及内容准确性与有效性。\n"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- dimensions:评估维度列表,每个维度对应的字典中包含dimension_name,description,和示例字段:\n"
" * dimension_name:维度名称\n"
" * description:维度的描述\n"
" * example_list:包含示例文本和得分的列表\n"
"- input_key:输入文本字段名\n"
"输出参数:\n"
"- 包含6个评估维度得分的DataFrame,列名为:Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness"
)
elif lang == "en":
return (
"Evaluate multiple meta attributes of text using LLM, including Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, and Content Accuracy & Effectiveness.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- dimensions: List of evaluation dimensions, each dimension corresponding to a dictionary containing dimension_name, description, and example field:\n"
" * dimension_name: Name of the dimension\n"
" * description: Description of the dimension\n"
" * example_list: List containing example texts and scores\n"
"- input_key: Field name for input text\n"
"Output Parameters:\n"
"- DataFrame containing scores for 6 evaluation dimensions with columns: Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness"
)
else:
return "Evaluate multiple meta attributes of text using LLM."
def get_score(self, samples, input_key):
system_prompt = self.prompt.build_system_prompt()
user_prompts = []
for sample in samples:
input_text = sample.get(input_key, '')
user_prompt = self.prompt.build_prompt(input_text)
full_prompt = system_prompt + "\n" + user_prompt
user_prompts.append(full_prompt)
responses = self.llm_serving.generate_from_input(user_inputs=user_prompts)
scores = []
for i, response in enumerate(responses):
try:
lines = response.strip().split("\n")
last_line = lines[-1].strip()
parsed_scores = ast.literal_eval(last_line)
if isinstance(parsed_scores, list) and len(parsed_scores) == len(self.output_columns):
scores.append(parsed_scores)
else:
raise ValueError("Score format invalid")
except Exception as e:
self.logger.warning(f"Failed to extract score from response {i}: {e}")
scores.append([float('nan')] * len(self.output_columns))
return scores
def eval(self, dataframe: pd.DataFrame, input_key: str):
samples = dataframe.to_dict(orient='records')
self.logger.info(f"Evaluating {self.score_name}...")
scores = self.get_score(samples, input_key)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str):
self.input_key = input_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, self.input_key)
# Expand the score lists into fixed-named columns, one per dimension
score_df = pd.DataFrame(scores, columns=self.output_columns)
dataframe = pd.concat([dataframe, score_df], axis=1)
storage.write(dataframe)
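# ---------------------------------------------------------------------------
# Minimal usage sketch for MetaSampleEvaluator (illustrative only). The llm_serving
# and storage objects are hypothetical placeholders, so those calls are left
# commented out; only the shape of a custom `dimensions` entry is shown here.
if __name__ == "__main__":
    custom_dimensions = [
        {
            "dimension_name": "Clarity",
            "description": "Judge whether the text states its point without ambiguity.",
            "example_list": [
                {"text": "Water boils at 100 degrees Celsius at sea level.", "score": "5"},
                {"text": "It kind of does the thing sometimes, maybe.", "score": "2"},
            ],
        }
    ]
    # evaluator = MetaSampleEvaluator(llm_serving=my_llm_serving, dimensions=custom_dimensions)
    # evaluator.run(storage=my_storage, input_key="raw_content")
    print([d["dimension_name"] for d in custom_dimensions])  # columns that would be appended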
import torch
from torch import nn
from transformers import BertModel, BertConfig, PreTrainedModel, AutoTokenizer
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
from dataflow.utils.utils import get_logger
import numpy as np
@OPERATOR_REGISTRY.register()
class PairQualSampleEvaluator(OperatorABC):
def __init__(self, model_cache_dir:str='./dataflow_cache', device="cuda", lang='en', max_length=512):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.model_cache_dir = model_cache_dir
self.lang = lang
self.max_length = max_length
self.score_name = 'PairQualScore'
if lang not in ['en', 'zh']:
raise ValueError("Invalid value for 'lang'. Only 'en' or 'zh' are allowed.")
if self.lang == 'en':
model = "zks2856/PairQual-Scorer-en"
config = BertConfig.from_pretrained(model, cache_dir=self.model_cache_dir)
self.model = BertForRegression_en.from_pretrained(model, config=config, trust_remote_code=True, cache_dir=self.model_cache_dir).to(self.device).eval()
self.tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True, cache_dir=self.model_cache_dir)
else:
model = "zks2856/PairQual-Scorer-zh"
config = BertConfig.from_pretrained(model, cache_dir=self.model_cache_dir)
self.model = BertForRegression_zh.from_pretrained(model, config=config, trust_remote_code=True, cache_dir=self.model_cache_dir).to(self.device).eval()
self.tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True, cache_dir=self.model_cache_dir)
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于BGE模型和GPT成对比较数据训练的文本质量评分器,支持中英文输入。通过对文本进行单样本评估,返回0-1之间的质量分数,"
"分数越高表示文本质量越好。模型分为英文版本(zks2856/PairQual-Scorer-en)和中文版本(zks2856/PairQual-Scorer-zh)。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"- lang: 语言类型,可选'en'或'zh'\n"
"输出参数:\n"
"- float: 0-1之间的质量分数,越高表示质量越好"
)
else:
return (
"Text quality scorer trained on BGE model and GPT pairwise comparison data, supporting bilingual input. Evaluate text through single-sample assessment, "
"returning a quality score between 0 and 1, where higher scores indicate better text quality. Models include English version (zks2856/PairQual-Scorer-en) and Chinese version (zks2856/PairQual-Scorer-zh).\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"- lang: Language type, optional 'en' or 'zh'\n"
"Output parameters:\n"
"- float: Quality score between 0 and 1, higher values indicate better quality"
)
def inference(self, input_text):
inputs = self.tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length).to(self.device)
with torch.no_grad():
_, score = self.model(inputs)
return score.item()
def eval(self, dataframe, input_key):
self.logger.info(f"Evaluating {self.score_name}...")
scores = []
for sample in tqdm(dataframe[input_key], desc="PairQualScorer Evaluating..."):
score = self.inference(sample)
scores.append(score)
self.logger.info("Evaluation complete!")
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='PairQualScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[output_key] = scores
storage.write(dataframe)
class BertForRegression_en(PreTrainedModel):
config_class = BertConfig
def __init__(self, config):
super().__init__(config)
self.bert = BertModel(config)
self.regression = nn.Sequential(
nn.Linear(config.hidden_size, 512),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(512, 1)
)
self.post_init()
def forward(self, inputs):
encoded = self.bert(**inputs)
score = self.regression(encoded['pooler_output'])
return encoded, score
class BertForRegression_zh(PreTrainedModel):
config_class = BertConfig
def __init__(self, config):
super().__init__(config)
self.bert = BertModel(config)
self.regression = nn.Sequential(
nn.Linear(config.hidden_size, 256),
nn.ReLU(),
nn.Linear(256, 1)
)
self.post_init()
def forward(self, inputs):
encoded = self.bert(**inputs)
score = self.regression(encoded['pooler_output'])
return encoded, score
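# ---------------------------------------------------------------------------
# Minimal usage sketch for PairQualSampleEvaluator (illustrative only). The actual
# calls are commented out because they download model weights from Hugging Face;
# the column name "raw_content" is an assumption, not a fixed requirement.
if __name__ == "__main__":
    import pandas as pd
    demo_df = pd.DataFrame({"raw_content": ["A short passage to be scored."]})
    # evaluator = PairQualSampleEvaluator(lang="en", device="cpu")
    # scores = evaluator.eval(demo_df, "raw_content")  # numpy array, one float per row
    # print(scores)
    print(demo_df.shape)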
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.utils import get_logger
@OPERATOR_REGISTRY.register()
class PerplexitySampleEvaluator(OperatorABC):
def __init__(self, model_name: str = 'gpt2', device='cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model_name = model_name
self.score_name = 'PerplexityScore'
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
# Load Hugging Face model and tokenizer
try:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
self.model.eval() # Set the model to evaluation mode
self.logger.info(f'{self.__class__.__name__} initialized with model {self.model_name}.')
except Exception as e:
self.logger.error(f"Error loading model: {e}")
raise RuntimeError(f"Model loading failed. Please ensure the model is available from Hugging Face.")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Huggingface语言模型计算文本的困惑度(Perplexity),困惑度越低表示文本的流畅性和可理解性越高。"
"输入参数:\n"
"- model_name:Huggingface模型路径或名称\n"
"- device:模型运行设备\n"
"输出参数:\n"
"- float: 困惑度值,越低表示文本流畅性越好"
)
else:
return (
"Calculate text perplexity using a Huggingface language model; lower perplexity indicates better fluency and understandability."
"Input Parameters:\n"
"- model_name: Huggingface model path or name\n"
"- device: Model device\n\n"
"Output Parameters:\n"
"- float: Perplexity score, lower values indicate better fluency and understandability"
)
def eval(self, dataframe, input_key):
input_texts = dataframe[input_key].to_list()
self.logger.info(f"Evaluating {self.score_name}...")
results = []
# Use tqdm to show progress
for text in tqdm(input_texts, desc="Evaluating perplexity", unit="text"):
perplexity = self.calculate_perplexity(text)
results.append(perplexity)
self.logger.info("Evaluation complete!")
return results
def calculate_perplexity(self, text: str) -> float:
""" 使用Hugging Face模型计算困惑度 """
# Encode the input text
inputs = self.tokenizer(text, return_tensors='pt', padding="longest", truncation=True).to(self.device)
# Calculate log probability
with torch.no_grad():
outputs = self.model(**inputs, labels=inputs['input_ids'])
log_likelihood = outputs.loss * inputs['input_ids'].size(1)
# Perplexity = exp(mean negative log-likelihood per token); since outputs.loss is already token-averaged, this equals exp(outputs.loss)
perplexity = torch.exp(log_likelihood / inputs['input_ids'].size(1)).item()
return perplexity
def run(self, storage: DataFlowStorage, input_key: str = 'raw_content', output_key: str = 'PerplexityScore'):
# Read the data, evaluate the score, and save the results
dataframe = storage.read("dataframe")
self.logger.info(f"Perplexity score ready to evaluate.")
scores = self.eval(dataframe, input_key)
dataframe[output_key] = scores
storage.write(dataframe)
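# ---------------------------------------------------------------------------
# Worked note on the formula in calculate_perplexity above: `outputs.loss` is already
# the mean negative log-likelihood per token, so multiplying by the sequence length
# and dividing it back out is a no-op and perplexity reduces to exp(loss). Pure
# arithmetic check with a hypothetical loss value; no model is involved.
if __name__ == "__main__":
    import math
    mean_nll = 3.2      # assumed mean negative log-likelihood per token
    num_tokens = 10
    total_nll = mean_nll * num_tokens
    assert math.isclose(math.exp(total_nll / num_tokens), math.exp(mean_nll))
    print(f"perplexity = {math.exp(mean_nll):.2f}")  # about 24.5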
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from datasets import Dataset
from tqdm import tqdm
from dataflow import get_logger
from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import ModelAnnotator
from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import TokenizeAndChunk
import torch
@OPERATOR_REGISTRY.register()
class QuratingSampleEvaluator(OperatorABC):
def __init__(self, map_batch_size: int = 512, num_workers: int = 1, device_batch_size: int = 16, device: str = 'cuda',
labels: list = ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value'], model_cache_dir: str = './dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model = 'princeton-nlp/QuRater-1.3B'
self.tokens_field = 'input_ids'
self.tokens = 512
self.map_batch_size = map_batch_size
self.batch_size = -1
self.num_workers = num_workers
self.model_cache_dir = model_cache_dir
self.labels = labels or []
self.device_batch_size = device_batch_size
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.score_type = float
self.data_type = 'text'
self.score_name = 'QuratingScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过Qurating模型(princeton-nlp/QuRater-1.3B)从四个维度评估文本质量:写作风格(writing_style)、所需专业程度(required_expertise)、"
"事实与趣闻(facts_and_trivia)和教育价值(educational_value)。每个维度返回0-1之间的分数,综合评估文本的整体质量。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"- labels: 评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"输出参数:\n"
"- dict: 包含各维度分数的字典,键为维度名称,值为0-1之间的分数"
)
else:
return (
"Evaluate text quality across four dimensions using the Qurating model (princeton-nlp/QuRater-1.3B): writing_style, required_expertise, "
"facts_and_trivia, and educational_value. Each dimension returns a score between 0 and 1, providing a comprehensive assessment of overall text quality.\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"- labels: List of evaluation dimensions, default ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"Output parameters:\n"
"- dict: Dictionary containing scores for each dimension, with keys as dimension names and values as scores between 0 and 1"
)
def _score_func(self, sample):
"""Process a single sample and return the score."""
batch_dict = {'text': [sample]} # Wrap sample into a list for processing
dataset = Dataset.from_dict(batch_dict)
# Tokenize and chunk
dataset = dataset.map(
TokenizeAndChunk(self.model, 'text', self.tokens_field, self.tokens, self.model_cache_dir),
batched=True,
batch_size=self.map_batch_size,
num_proc=self.num_workers,
remove_columns=dataset.column_names
)
# Annotate the model results
dataset = dataset.map(
ModelAnnotator(self.model, self.labels, self.device_batch_size, self.device, self.model_cache_dir),
batched=True,
with_indices=True,
batch_size=self.map_batch_size,
remove_columns=dataset.column_names
)
results_dict = dataset.to_dict()
result_filtered = {}
for label in self.labels:
average_key = f"{label}_average"
if average_key in results_dict:
new_key = f"Qurating{''.join([word.capitalize() for word in label.split('_')])}Score"
# Single-sample call: take the sole average value for this label
result_filtered[new_key] = results_dict[average_key][0]
return result_filtered
def eval(self, dataframe, input_key):
self.logger.info(f"Evaluating {self.score_name}...")
batch_dict = {'text': dataframe[input_key]} # Build a single-column dataset from the input texts
dataset = Dataset.from_dict(batch_dict)
# Tokenize and chunk
dataset = dataset.map(
TokenizeAndChunk(self.model, 'text', self.tokens_field, self.tokens, self.model_cache_dir),
batched=True,
batch_size=self.map_batch_size,
num_proc=self.num_workers,
remove_columns=dataset.column_names
)
# Annotate the model results
dataset = dataset.map(
ModelAnnotator(self.model, self.labels, self.device_batch_size, self.device, self.model_cache_dir),
batched=True,
with_indices=True,
batch_size=self.map_batch_size,
remove_columns=dataset.column_names
)
results_dict = dataset.to_dict()
result_filtered = {}
for label in self.labels:
average_key = f"{label}_average"
if average_key in results_dict:
new_key = f"Qurating{''.join([word.capitalize() for word in label.split('_')])}Score"
result_filtered[new_key] = results_dict[average_key] # Use the average values
self.logger.info("Evaluation complete!")
return result_filtered
def run(self, storage: DataFlowStorage, input_key: str, output_key: str):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
# eval() returns a dict mapping score-column names to per-sample value lists
for key, value in scores.items():
if key not in dataframe:
dataframe[key] = value
storage.write(dataframe)
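# ---------------------------------------------------------------------------
# Illustration of the score-column naming used in eval/_score_func above: each label
# is CamelCased and wrapped as "Qurating<Label>Score". Pure string manipulation.
if __name__ == "__main__":
    for label in ["writing_style", "educational_value"]:
        key = f"Qurating{''.join(word.capitalize() for word in label.split('_'))}Score"
        print(label, "->", key)  # e.g. writing_style -> QuratingWritingStyleScore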
from typing import List
import re
from huggingface_hub import hf_hub_download
import fasttext
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
import numpy as np
@OPERATOR_REGISTRY.register()
class TextbookSampleEvaluator(OperatorABC):
def __init__(self, model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
model_path = hf_hub_download(
repo_id='kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2',
filename='model.bin',
cache_dir=model_cache_dir
)
low_score=1.0
mid_score=3.0
high_score=5.0
self.model = fasttext.load_model(model_path)
self.score_type = float
self.data_type = 'text'
self.score_name = 'TextbookScore'
self.score_dict = {
'__label__Low': low_score,
'__label__Mid': mid_score,
'__label__High': high_score
}
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于FastText分类器(kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2)评估文本的教育价值,将文本分为低(Low)、中(Mid)、高(High)三个等级,"
"并映射为1.0、3.0、5.0的分数。适用于筛选适合作为教材的高质量文本内容。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"输出参数:\n"
"- float: 教育价值分数,可能值为1.0(低)、3.0(中)、5.0(高)"
)
else:
return (
"Assess the educational value of text using a FastText classifier (kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2), categorizing text into Low, Mid, and High levels, "
"mapped to scores of 1.0, 3.0, and 5.0 respectively. Suitable for filtering high-quality text content suitable as teaching materials.\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"Output parameters:\n"
"- float: Educational value score, possible values 1.0 (Low), 3.0 (Mid), 5.0 (High)"
)
@staticmethod
def replace_newlines(text: str) -> str:
return re.sub("\n+", " ", text)
def _score_func(self, text_list: List[str]) -> List[float]:
text_list = [self.replace_newlines(text) for text in text_list]
pred = self.model.predict(text_list, k=-1)
score_list = []
for labels, scores in zip(*pred):
score = 0
for label, score_value in zip(labels, scores):
score += self.score_dict.get(label, 0) * score_value
score_list.append(float(score))
return score_list
def eval(self, dataframe, input_key):
scores = []
text_list = dataframe[input_key]
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(text_list, desc="TextbookScorer Evaluating..."):
score = self._score_func([sample])
scores.append(score)
self.logger.info("Evaluation complete!")
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='TextbookScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
# eval() returns one single-element score list per sample; flatten to one score per row
dataframe[output_key] = [score_list[0] for score_list in scores]
storage.write(dataframe)
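# ---------------------------------------------------------------------------
# Worked example of the expected-score computation in _score_func above: fastText
# returns a probability per label, and the final score is the probability-weighted
# sum of the mapped values (1.0 / 3.0 / 5.0). The probabilities below are assumed.
if __name__ == "__main__":
    score_dict = {'__label__Low': 1.0, '__label__Mid': 3.0, '__label__High': 5.0}
    labels = ('__label__High', '__label__Mid', '__label__Low')
    probs = (0.7, 0.2, 0.1)  # hypothetical classifier output, sums to 1
    expected = sum(score_dict[label] * p for label, p in zip(labels, probs))
    print(expected)  # 5.0*0.7 + 3.0*0.2 + 1.0*0.1, approximately 4.2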
# import sys
# from dataflow.utils.registry import LazyLoader
# cur_path = "dataflow/operators/filter/"
# _import_structure = {
# # Primary filters
# "NgramFilter": (cur_path + "ngram_filter.py", "NgramFilter"),
# "LanguageFilter": (cur_path + "language_filter.py", "LanguageFilter"),
# "DeitaQualityFilter": (cur_path + "deita_quality_filter.py", "DeitaQualityFilter"),
# "DeitaComplexityFilter": (cur_path + "deita_complexity_filter.py", "DeitaComplexityFilter"),
# "InstagFilter": (cur_path + "instag_filter.py", "InstagFilter"),
# "PairQualFilter": (cur_path + "pair_qual_filter.py", "PairQualFilter"),
# "QuratingFilter": (cur_path + "qurating_filter.py", "QuratingFilter"),
# "SuperfilteringFilter": (cur_path + "superfiltering_filter.py", "SuperfilteringFilter"),
# "FineWebEduFilter": (cur_path + "fineweb_edu_filter.py", "FineWebEduFilter"),
# "TextbookFilter": (cur_path + "text_book_filter.py", "TextbookFilter"),
# "AlpagasusFilter": (cur_path + "alpagasus_filter.py", "AlpagasusFilter"),
# "DebertaV3Filter": (cur_path + "debertav3_filter.py", "DebertaV3Filter"),
# "LangkitFilter": (cur_path + "langkit_filter.py", "LangkitFilter"),
# "LexicalDiversityFilter": (cur_path + "lexical_diversity_filter.py", "LexicalDiversityFilter"),
# "PerplexityFilter": (cur_path + "perplexity_filter.py", "PerplexityFilter"),
# "PerspectiveFilter": (cur_path + "perspective_filter.py", "PerspectiveFilter"),
# "PresidioFilter": (cur_path + "presidio_filter.py", "PresidioFilter"),
# "RMFilter": (cur_path + "reward_model_filter.py", "RMFilter"),
# "TreeinstructFilter": (cur_path + "treeinstruct_filter.py", "TreeinstructFilter"),
# # Heuristic filters
# "ColonEndFilter": (cur_path + "heuristics.py", "ColonEndFilter"),
# "WordNumberFilter": (cur_path + "heuristics.py", "WordNumberFilter"),
# "BlocklistFilter": (cur_path + "heuristics.py", "BlocklistFilter"),
# "SentenceNumberFilter": (cur_path + "heuristics.py", "SentenceNumberFilter"),
# "LineEndWithEllipsisFilter": (cur_path + "heuristics.py", "LineEndWithEllipsisFilter"),
# "ContentNullFilter": (cur_path + "heuristics.py", "ContentNullFilter"),
# "MeanWordLengthFilter": (cur_path + "heuristics.py", "MeanWordLengthFilter"),
# "SymbolWordRatioFilter": (cur_path + "heuristics.py", "SymbolWordRatioFilter"),
# "HtmlEntityFilter": (cur_path + "heuristics.py", "HtmlEntityFilter"),
# "IDCardFilter": (cur_path + "heuristics.py", "IDCardFilter"),
# "NoPuncFilter": (cur_path + "heuristics.py", "NoPuncFilter"),
# "SpecialCharacterFilter": (cur_path + "heuristics.py", "SpecialCharacterFilter"),
# "WatermarkFilter": (cur_path + "heuristics.py", "WatermarkFilter"),
# "StopWordFilter": (cur_path + "heuristics.py", "StopWordFilter"),
# "CurlyBracketFilter": (cur_path + "heuristics.py", "CurlyBracketFilter"),
# "CapitalWordsFilter": (cur_path + "heuristics.py", "CapitalWordsFilter"),
# "LoremIpsumFilter": (cur_path + "heuristics.py", "LoremIpsumFilter"),
# "UniqueWordsFilter": (cur_path + "heuristics.py", "UniqueWordsFilter"),
# "CharNumberFilter": (cur_path + "heuristics.py", "CharNumberFilter"),
# "LineStartWithBulletpointFilter": (cur_path + "heuristics.py", "LineStartWithBulletpointFilter"),
# "LineWithJavascriptFilter": (cur_path + "heuristics.py", "LineWithJavascriptFilter"),
# # Deduplicators
# "MinHashDeduplicator": (cur_path + "minhash_deduplicator.py", "MinHashDeduplicator"),
# "CCNetDeduplicator": (cur_path + "ccnet_deduplicator.py", "CCNetDeduplicator"),
# "HashDeduplicator": (cur_path + "hash_deduplicator.py", "HashDeduplicator"),
# "NgramHashDeduplicator": (cur_path + "ngramhash_deduplicator.py", "NgramHashDeduplicator"),
# "SemDeduplicator": (cur_path + "sem_deduplicator.py", "SemDeduplicator"),
# "SimHashDeduplicator": (cur_path + "simhash_deduplicator.py", "SimHashDeduplicator"),
# }
# sys.modules[__name__] = LazyLoader(__name__, cur_path, _import_structure)
import hashlib
import struct
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
def sha1_hash(data: bytes, d: int = 32) -> int:
"""
Generate a d-bit hash value from the given data.
Parameters
----------
data : bytes
The data to be hashed.
d : int
The number of bits of the hash value.
Returns
-------
int
The hash value.
Examples
--------
>>> sha1_hash(b"hello world", 32)
896314922
>>> sha1_hash(b"hello world", 64)
13028719972609469994
>>> sha1_hash(b"hello world", 128)
310522945683037930239412421226792791594
"""
if d == 32:
return struct.unpack("<I", hashlib.sha1(data, usedforsecurity=False).digest()[:4])[0]
if d == 64:
return struct.unpack("<Q", hashlib.sha1(data, usedforsecurity=False).digest()[:8])[0]
# struct is faster but does not support arbitrary bit lengths
return int.from_bytes(hashlib.sha1(data, usedforsecurity=False).digest()[: d // 8], byteorder="little")
@OPERATOR_REGISTRY.register()
class CCNetDeduplicateFilter(OperatorABC):
def __init__(self, bit_length: int = 64):
self.logger = get_logger()
self.bit_length = bit_length
self.logger.info(f"Initializing {self.__class__.__name__} with bit length = {bit_length}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"CCNet去重方法,基于SHA-1哈希算法的前N位进行重复识别,实现精确去重。\n\n"
"初始化参数:\n"
"- bit_length: 哈希值的位数,默认为64位\n\n"
"运行参数:\n"
"- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n"
"- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n"
"- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n"
"输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据"
)
else:
return (
"CCNet deduplication method. Identify duplicates using first N bits of SHA-1 hash for exact deduplication.\n\n"
"Initialization Parameters:\n"
"- bit_length: Number of bits for hash value, default is 64\n\n"
"Run Parameters:\n"
"- input_keys: List of multiple fields for hash calculation (alternative to input_key)\n"
"- input_key: Single field name for hash calculation (alternative to input_keys)\n"
"- output_key: Deduplication label field name, default is 'minhash_deduplicated_label'\n\n"
"Output Description: Data marked as 1 indicates first occurrence, 0 indicates duplicate"
)
def _compute_hash(self, text: bytes) -> int:
return sha1_hash(text, self.bit_length)
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
seen_hashes = set()
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
else:
text = sample[self.input_key]
text = text.encode('utf-8')
hash_value = self._compute_hash(text)
if hash_value not in seen_hashes:
labels[idx] = 1
seen_hashes.add(hash_value)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
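# ---------------------------------------------------------------------------
# Minimal sketch of the first-occurrence labeling used in run() above: identical
# texts hash to the same value, so only the first copy gets label 1. Safe to run.
if __name__ == "__main__":
    seen, labels = set(), []
    for text in ["hello world", "another line", "hello world"]:
        h = sha1_hash(text.encode("utf-8"), 64)
        labels.append(1 if h not in seen else 0)
        seen.add(h)
    print(labels)  # [1, 1, 0]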
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_pt import DebertaV3SampleEvaluator
@OPERATOR_REGISTRY.register()
class DebertaV3Filter(OperatorABC):
def __init__(self, allowed_scores : list = ['Medium', 'High'], model_name='nvidia/quality-classifier-deberta', model_cache_dir='./dataflow_cache', device='cuda', batch_size=16):
self.logger = get_logger()
self.allowed_scores = allowed_scores
self.scorer = DebertaV3SampleEvaluator(
model_name=model_name,
model_cache_dir=model_cache_dir,
device=device,
batch_size=batch_size,
)
self.logger.info(f"Initializing {self.__class__.__name__} with allowed_scores = {self.allowed_scores}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于DebertaV3Scorer打分器的得分对数据进行过滤。使用Nvidia Deberta V3模型的质量分类器评估文本质量。\n\n"
"初始化参数:\n"
"- allowed_scores: 允许通过的分数列表,默认为['Medium', 'High']\n"
"- model_name: 模型名称,默认为'nvidia/quality-classifier-deberta'\n"
"- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n"
"- device: 运行设备,默认为'cuda'\n"
"- batch_size: 批处理大小,默认为16\n\n"
"运行参数:\n"
"- input_key: 输入文本字段名\n"
"- output_key: 输出分数字段名,默认为'Debertav3Score'\n\n"
"过滤逻辑:保留分类结果在allowed_scores列表中的数据"
)
else:
return (
"Filter data using scores from the DebertaV3Scorer. Evaluate text quality using Nvidia Deberta V3 model-based quality classifier.\n\n"
"Initialization Parameters:\n"
"- allowed_scores: List of allowed scores, default is ['Medium', 'High']\n"
"- model_name: Model name, default is 'nvidia/quality-classifier-deberta'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- device: Running device, default is 'cuda'\n"
"- batch_size: Batch size, default is 16\n\n"
"Run Parameters:\n"
"- input_key: Input text field name\n"
"- output_key: Output score field name, default is 'Debertav3Score'\n\n"
"Filter Logic: Keep data with classification results in allowed_scores list"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'Debertav3Score'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
dataframe[self.output_key] = scores
labels = np.array([1 if score in self.allowed_scores else 0 for score in scores])
filtered_dataframe = dataframe[labels == 1]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
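# ---------------------------------------------------------------------------
# Small illustration of the allowed_scores mask built in run() above, using
# hypothetical classifier outputs; only rows labelled Medium or High survive.
if __name__ == "__main__":
    demo_scores = ["High", "Low", "Medium"]
    allowed = ["Medium", "High"]
    mask = np.array([1 if s in allowed else 0 for s in demo_scores])
    print(mask)  # [1 0 1]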
from dataflow.operators.text_pt import FineWebEduSampleEvaluator
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
@OPERATOR_REGISTRY.register()
class FineWebEduFilter(OperatorABC):
def __init__(self, min_score: float = 2.5, max_score: float = 10000, model_cache_dir: str = './dataflow_cache', device: str = 'cuda'):
self.min_score = min_score
self.max_score = max_score
self.logger = get_logger()
self.scorer = FineWebEduSampleEvaluator(model_cache_dir=model_cache_dir, device=device)
self.filter_name = 'FineWebEduFilter'
self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}, "
f"device = {device}, model_cache_dir = {model_cache_dir}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于FineWebEduScorer打分器的得分对数据进行过滤。Fineweb-Edu是一个用于评估文本教育价值的分类器。\n\n"
"初始化参数:\n"
"- min_score: 最低分数阈值,默认为2.5\n"
"- max_score: 最高分数阈值,默认为10000\n"
"- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n"
"- device: 运行设备,默认为'cuda'\n\n"
"运行参数:\n"
"- input_key: 输入文本字段名\n"
"- output_key: 输出分数字段名,默认为'FinewebEduScore'\n\n"
"评分标准:0-5分,分数越高表示文本具有越高的教育价值\n"
"过滤逻辑:保留分数在[min_score, max_score]范围内的数据"
)
else:
return (
"Filter data using scores from the FineWebEduScorer. Fineweb-Edu is a classifier for evaluating educational value of text.\n\n"
"Initialization Parameters:\n"
"- min_score: Minimum score threshold, default is 2.5\n"
"- max_score: Maximum score threshold, default is 10000\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- device: Running device, default is 'cuda'\n\n"
"Run Parameters:\n"
"- input_key: Input text field name\n"
"- output_key: Output score field name, default is 'FinewebEduScore'\n\n"
"Scoring Standard: 0-5 points, higher score indicates more educational content\n"
"Filter Logic: Keep data with scores in [min_score, max_score] range"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='FinewebEduScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.filter_name}...")
scores = self.scorer.eval(dataframe, input_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.operators.text_pt import PairQualSampleEvaluator
import numpy as np
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class PairQualFilter(OperatorABC):
def __init__(self, min_score=0, max_score=10000, model_cache_dir='./dataflow_cache', lang='en'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = PairQualSampleEvaluator(model_cache_dir=model_cache_dir, lang=lang)
self.filter_name = 'PairQualFilter'
self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PairQualScorer打分器的得分对数据进行过滤。基于BGE模型,使用GPT对文本成对比较打分后训练而成的双语文本质量评分器,得分越高表示质量越高。\n"
"输入参数:\n"
"- min_score:最小质量得分阈值\n"
"- max_score:最大质量得分阈值\n"
"- model_cache_dir:模型缓存目录路径\n"
"- lang:文本语言类型\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留质量得分在指定范围内的文本\n"
"- 返回包含质量得分字段名的列表"
)
else:
return (
"Filter data using scores from the PairQualScorer. Bilingual text quality scorer trained on GPT pairwise comparison annotations using BGE model; higher scores indicate better quality.\n"
"Input Parameters:\n"
"- min_score: Minimum quality score threshold\n"
"- max_score: Maximum quality score threshold\n"
"- model_cache_dir: Model cache directory path\n"
"- lang: Text language type\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with quality score within specified range\n"
"- List containing quality score field name"
)
def eval(self, dataframe, input_key):
self.logger.info(f"Start evaluating {self.filter_name}...")
# Get the scores using the scorer
scores = self.scorer.eval(dataframe, input_key)
# Return the scores for filtering
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='PairQualScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.filter_name} with input_key = {self.input_key} and output_key = {self.output_key}...")
scores = np.array(self.scorer.eval(dataframe, input_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_pt import PerplexitySampleEvaluator
@OPERATOR_REGISTRY.register()
class PerplexityFilter(OperatorABC):
def __init__(self, min_score: float = 10.0, max_score: float = 500.0, model_name: str = 'gpt2', device='cuda'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = PerplexitySampleEvaluator(model_name=model_name, device=device)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PerplexityScorer打分器的得分对数据进行过滤。基于Huggingface模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。\n"
"输入参数:\n"
"- min_score:最小困惑度阈值\n"
"- max_score:最大困惑度阈值\n"
"- model_name:Huggingface模型路径或名称\n"
"- device:模型运行设备\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留困惑度在指定范围内的文本\n"
"- 返回包含困惑度得分字段名的列表"
)
else:
return (
"Filter data using scores from the PerplexityScorer. Uses Huggingface model to calculate text perplexity; lower scores indicate better fluency and understandability.\n"
"Input Parameters:\n"
"- min_score: Minimum perplexity threshold\n"
"- max_score: Maximum perplexity threshold\n"
"- model_name: Huggingface model path or name\n"
"- device: Model device\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with perplexity within specified range\n"
"- List containing perplexity score field name"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'PerplexityScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
# Compute scores and filter
scores = np.array(self.scorer.eval(dataframe, self.input_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.operators.text_pt import QuratingSampleEvaluator
import numpy as np
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class QuratingFilter(OperatorABC):
def __init__(self, min_scores: dict = {'writing_style': 0,'required_expertise': 0,'facts_and_trivia': 0,'educational_value': 0}, max_scores: dict = {'writing_style': 9,'required_expertise': 9,'facts_and_trivia': 9,'educational_value': 9},
map_batch_size: int = 512, num_workers: int = 1, device_batch_size: int = 16, device: str = 'cuda',
labels: list = ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value'], model_cache_dir: str = './dataflow_cache'):
self.logger = get_logger()
self.min_scores = min_scores
self.max_scores = max_scores
# Initialize the QuratingScorer with the passed parameters
self.scorer = QuratingSampleEvaluator(map_batch_size=map_batch_size,
num_workers=num_workers, device_batch_size=device_batch_size, device=device,
labels=labels, model_cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores = {self.min_scores} and max_scores = {self.max_scores}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于QuratingScorer打分器的得分对数据进行过滤。通过Qurating模型从四个维度评估文本质量:写作风格、所需专业知识、事实与 trivia 内容、教育价值。\n"
"每个维度评分范围为0-9分,综合判断文本质量,可用于筛选高质量教育类或知识类内容。\n"
"输入参数:\n"
"- min_scores:各维度保留样本的最小分数阈值,默认为{'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n"
"- max_scores:各维度保留样本的最大分数阈值,默认为{'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n"
"- map_batch_size:映射批次大小,默认为512\n"
"- num_workers:数据加载工作进程数,默认为1\n"
"- device_batch_size:设备批次大小,默认为16\n"
"- device:模型运行设备,默认为'cuda'\n"
"- labels:评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留所有维度分数均在对应阈值范围内的样本\n"
"- 返回包含各维度过滤结果字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the QuratingScorer. Evaluate text quality across four dimensions using Qurating model: writing style, required expertise, facts and trivia content, and educational value.\n"
"Each dimension is scored from 0-9, providing comprehensive quality assessment for filtering high-quality educational or knowledge-based content.\n"
"Input Parameters:\n"
"- min_scores: Minimum score thresholds for each dimension, default is {'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n"
"- max_scores: Maximum score thresholds for each dimension, default is {'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n"
"- map_batch_size: Mapping batch size, default is 512\n"
"- num_workers: Number of data loading workers, default is 1\n"
"- device_batch_size: Device batch size, default is 16\n"
"- device: Model running device, default is 'cuda'\n"
"- labels: List of evaluation dimensions, default is ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with all dimension scores within corresponding threshold ranges\n"
"- List containing field names of each dimension's filtering results for subsequent operator reference"
)
else:
return "Filter data based on multi-dimensional quality assessment using Qurating model."
def run(self, storage: DataFlowStorage, input_key: str):
self.input_key = input_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
# Get the scores for filtering
scores = self.scorer.eval(dataframe, self.input_key)
# Initialize results to all valid (1)
results = np.ones(len(dataframe), dtype=int)
# Iterate over each label to apply the filter and add a column
for label in self.min_scores.keys():
min_score = self.min_scores[label]
max_score = self.max_scores[label]
score_key = f"Qurating{''.join([word.capitalize() for word in label.split('_')])}Score"
metric_scores = np.array(scores[score_key])
# Apply score filter for the current label
metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score)
results = results & metric_filter.astype(int)
# Add a new column named '{label}_label' containing 0 or 1 based on the filter
dataframe[f"{label}_label"] = metric_filter.astype(int)
# Filter the dataframe based on the results
filtered_dataframe = dataframe[results == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
result = [f"{label}_label" for label in self.min_scores.keys()]
return result
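# ---------------------------------------------------------------------------
# Small illustration of how the per-dimension masks above are combined: a sample is
# kept only if every dimension's score falls inside its [min, max] range. The score
# arrays below are hypothetical.
if __name__ == "__main__":
    writing_style = np.array([4.0, 8.5])
    educational_value = np.array([6.0, 2.0])
    keep = ((0 <= writing_style) & (writing_style <= 9)).astype(int)
    keep &= ((3 <= educational_value) & (educational_value <= 9)).astype(int)
    print(keep)  # [1 0]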
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_pt import TextbookSampleEvaluator
@OPERATOR_REGISTRY.register()
class TextbookFilter(OperatorABC):
def __init__(self, min_score=0.99, max_score=1, model_cache_dir:str='./dataflow_cache'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = TextbookSampleEvaluator(model_cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于TextbookScorer打分器的得分对数据进行过滤。使用FastText分类器评估文本的教育价值,判断文本是否适合作为教材内容。\n"
"分类器经过训练可识别具有教育意义、结构清晰、知识准确的文本,适用于构建教育类数据集。\n"
"输入参数:\n"
"- min_score:保留样本的最小教育价值分数阈值,默认为0.99\n"
"- max_score:保留样本的最大教育价值分数阈值,默认为1.0\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- input_key:输入文本字段名\n"
"- output_key:教育价值分数字段名,默认为'TextbookScore'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留教育价值分数在[min_score, max_score]范围内的样本\n"
"- 返回包含教育价值分数字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the TextbookScorer. Assess educational value of text using FastText classifier to determine if text is suitable as educational material.\n"
"Classifier is trained to identify text with educational significance, clear structure, and accurate knowledge, suitable for building educational datasets.\n"
"Input Parameters:\n"
"- min_score: Minimum educational value score threshold for retaining samples, default is 0.99\n"
"- max_score: Maximum educational value score threshold for retaining samples, default is 1.0\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- input_key: Input text field name\n"
"- output_key: Educational value score field name, default is 'TextbookScore'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with educational value scores within [min_score, max_score] range\n"
"- List containing educational value score field name for subsequent operator reference"
)
else:
return "Filter data based on educational value assessment using FastText textbook classifier."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='TextbookScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]