Commit 19a23d09 authored by wangsen's avatar wangsen

Initial commit

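# Launcher for the distributed BERT+CRF NER training below on DCU cards.
# Usage sketch (assuming this launcher is saved as e.g. run_ddp.sh next to crf_ddp.py):
#   bash run_ddp.sh          # use every DCU reported by rocm-smi, starting from device 0
#   bash run_ddp.sh 4        # use 4 DCUs: 0,1,2,3
#   bash run_ddp.sh 4 2      # use 4 DCUs starting from device 2: 2,3,4,5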
# Detect the number of DCUs from rocm-smi output (rows between the 'DCU' header and the '===' separator).
NUM=$(($(rocm-smi | sed -n '/DCU/,/===/ p' | wc -l) - 2))
START=0
if [ $# -gt 0 ]; then  # optional argument 1: number of DCUs to use
    NUM=$1
fi
if [ $# -gt 1 ]; then  # optional argument 2: ID of the first DCU
    START=$2
fi
LAST=$((START + NUM - 1))
export HIP_VISIBLE_DEVICES=$(seq -s, ${START} ${LAST})
#export ROCBLAS_LAYER=3
#export MIOPEN_ENABLE_LOGGING=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export MIOPEN_LOG_LEVEL=6
export HSA_FORCE_FINE_GRAIN_PCIE=1
logfile=bert_base_${NUM}dcu_bs64_epoch30_$(date +%Y%m%d%H%M%S).log
python3 -m torch.distributed.run --nproc_per_node=${NUM} crf_ddp.py 2>&1 | tee $logfile
#! -*- coding:utf-8 -*-
# W2NER: https://github.com/ljynlp/W2NER
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.layers import LayerNorm
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from collections import defaultdict, deque
from sklearn.metrics import precision_recall_fscore_support
# Model hyperparameters: training
epochs = 20  # number of training epochs
steps_per_epoch = 100  # steps per epoch
maxlen = 256  # maximum sequence length
batch_size = 8  # set according to GPU memory
learning_rate = 1e-3
clip_grad_norm = 5.0
bert_learning_rate = 5e-6
warm_factor = 0.1
weight_decay = 0
use_bert_last_4_layers = True
categories = {'LOC':2, 'PER':3, 'ORG':4}
label_num = len(categories) + 2
# Model hyperparameters: network architecture
dist_emb_size = 20
type_emb_size = 20
lstm_hid_size = 512
conv_hid_size = 96
bert_hid_size = 768
biaffine_size = 512
ffnn_hid_size = 288
dilation = [1, 2, 3]
emb_dropout = 0.5
conv_dropout = 0.5
out_dropout = 0.33
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Relative-distance bucket table
dis2idx = np.zeros((1000), dtype='int64')
dis2idx[1] = 1
dis2idx[2:] = 2
dis2idx[4:] = 3
dis2idx[8:] = 4
dis2idx[16:] = 5
dis2idx[32:] = 6
dis2idx[64:] = 7
dis2idx[128:] = 8
dis2idx[256:] = 9
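# dis2idx buckets the absolute relative distance roughly on a log2 scale:
# 0 -> 0, 1 -> 1, 2-3 -> 2, 4-7 -> 3, 8-15 -> 4, 16-31 -> 5, 32-63 -> 6,
# 64-127 -> 7, 128-255 -> 8, 256-999 -> 9. Negative distances get an extra +9
# offset in load_data below, and bucket 0 (the diagonal) is remapped to 19 there,
# so embedding index 0 is effectively reserved for padding.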
# Small helper functions
def convert_index_to_text(index, type):
text = "-".join([str(i) for i in index])
text = text + "-#-{}".format(type)
return text
def convert_text_to_index(text):
index, type = text.split("-#-")
index = [int(x) for x in index.split("-")]
return index, int(type)
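# Example: convert_index_to_text([3, 4, 5], 2) == '3-4-5-#-2'
#          convert_text_to_index('3-4-5-#-2') == ([3, 4, 5], 2)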
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in tqdm(f.split('\n\n'), desc='Load data'):
if not l:
continue
sentence, d = [], []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
sentence += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
if len(sentence) > maxlen - 2:
continue
tokens = [tokenizer.tokenize(word)[1:-1] for word in sentence[:maxlen-2]]
pieces = [piece for pieces in tokens for piece in pieces]
tokens_ids = [tokenizer._token_start_id] + tokenizer.tokens_to_ids(pieces) + [tokenizer._token_end_id]
assert len(tokens_ids) <= maxlen
length = len(tokens)
# Mapping between word pieces and words; for Chinese the two coincide, except for [CLS] and [SEP]
_pieces2word = np.zeros((length, len(tokens_ids)), dtype=bool)
e_start = 0
for i, pieces in enumerate(tokens):
if len(pieces) == 0:
continue
pieces = list(range(e_start, e_start + len(pieces)))
_pieces2word[i, pieces[0] + 1:pieces[-1] + 2] = 1
e_start += len(pieces)
# Relative distances
_dist_inputs = np.zeros((length, length), dtype=int)
for k in range(length):
_dist_inputs[k, :] += k
_dist_inputs[:, k] -= k
for i in range(length):
for j in range(length):
if _dist_inputs[i, j] < 0:
_dist_inputs[i, j] = dis2idx[-_dist_inputs[i, j]] + 9
else:
_dist_inputs[i, j] = dis2idx[_dist_inputs[i, j]]
_dist_inputs[_dist_inputs == 0] = 19
# Gold labels
_grid_labels = np.zeros((length, length), dtype=int)
_grid_mask2d = np.ones((length, length), dtype=bool)
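# Label scheme (W2NER): for each gold entity, grid_labels[w_i, w_{i+1}] = 1 marks the
# Next-Neighboring-Word (NNW) link between consecutive entity words, and
# grid_labels[tail, head] = class id (>1) marks the Tail-Head-Word (THW-*) relation
# that carries the entity type.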
for entity in d:
e_start, e_end, e_type = entity[0], entity[1]+1, entity[-1]
if e_end >= maxlen - 2:
continue
index = list(range(e_start, e_end))
for i in range(len(index)):
if i + 1 >= len(index):
break
_grid_labels[index[i], index[i + 1]] = 1
_grid_labels[index[-1], index[0]] = categories[e_type]
_entity_text = set([convert_index_to_text(list(range(e[0], e[1]+1)), categories[e[-1]]) for e in d])
D.append((tokens_ids, _pieces2word, _dist_inputs, _grid_labels, _grid_mask2d, _entity_text))
return D
def collate_fn(data):
tokens_ids, pieces2word, dist_inputs, grid_labels, grid_mask2d, _entity_text = map(list, zip(*data))
sent_length = torch.tensor([i.shape[0] for i in pieces2word], dtype=torch.long, device=device)
# max_wordlen: length in words (not tokens); max_tokenlen: length in tokens
max_wordlen = torch.max(sent_length).item()
max_tokenlen = np.max([len(x) for x in tokens_ids])
tokens_ids = torch.tensor(sequence_padding(tokens_ids), dtype=torch.long, device=device)
batch_size = tokens_ids.size(0)
def fill(data, new_data):
for j, x in enumerate(data):
new_data[j, :x.shape[0], :x.shape[1]] = torch.tensor(x, dtype=torch.long, device=device)
return new_data
dis_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.long, device=device)
dist_inputs = fill(dist_inputs, dis_mat)
labels_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.long, device=device)
grid_labels = fill(grid_labels, labels_mat)
mask2d_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.bool, device=device)
grid_mask2d = fill(grid_mask2d, mask2d_mat)
sub_mat = torch.zeros((batch_size, max_wordlen, max_tokenlen), dtype=torch.bool, device=device)
pieces2word = fill(pieces2word, sub_mat)
return [tokens_ids, pieces2word, dist_inputs, sent_length, grid_mask2d], [grid_labels, grid_mask2d, _entity_text]
# Load the data
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class ConvolutionLayer(nn.Module):
'''Convolution layer
'''
def __init__(self, input_size, channels, dilation, dropout=0.1):
super(ConvolutionLayer, self).__init__()
self.base = nn.Sequential(
nn.Dropout2d(dropout),
nn.Conv2d(input_size, channels, kernel_size=1),
nn.GELU(),
)
self.convs = nn.ModuleList(
[nn.Conv2d(channels, channels, kernel_size=3, groups=channels, dilation=d, padding=d) for d in dilation])
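# The convs above are depthwise (groups=channels) with dilation d and padding d, so the
# [seq_len, seq_len] grid size is preserved; forward() applies them in sequence and keeps
# every intermediate output, concatenating them along the channel dimension
# (channels * len(dilation) channels in total, matching the CoPredictor input below).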
def forward(self, x):
x = x.permute(0, 3, 1, 2).contiguous()
x = self.base(x)
outputs = []
for conv in self.convs:
x = conv(x)
x = F.gelu(x)
outputs.append(x)
outputs = torch.cat(outputs, dim=1)
outputs = outputs.permute(0, 2, 3, 1).contiguous()
return outputs
class Biaffine(nn.Module):
'''Biaffine transformation
'''
def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
super(Biaffine, self).__init__()
self.n_in = n_in
self.n_out = n_out
self.bias_x = bias_x
self.bias_y = bias_y
weight = torch.zeros((n_out, n_in + int(bias_x), n_in + int(bias_y)))
nn.init.xavier_normal_(weight)
self.weight = nn.Parameter(weight, requires_grad=True)
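# bias_x / bias_y append a constant 1 feature to x / y in forward(), so the single bilinear
# form also covers the linear terms and the bias of a standard biaffine scorer.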
def extra_repr(self):
s = f"n_in={self.n_in}, n_out={self.n_out}"
if self.bias_x:
s += f", bias_x={self.bias_x}"
if self.bias_y:
s += f", bias_y={self.bias_y}"
return s
def forward(self, x, y):
if self.bias_x:
x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
if self.bias_y:
y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
# [batch_size, n_out, seq_len, seq_len]
s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
# remove dim 1 if n_out == 1
s = s.permute(0, 2, 3, 1)
return s
class MLP(nn.Module):
'''Fully connected MLP layer
'''
def __init__(self, n_in, n_out, dropout=0):
super().__init__()
self.linear = nn.Linear(n_in, n_out)
self.activation = nn.GELU()
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.dropout(x)
x = self.linear(x)
x = self.activation(x)
return x
class CoPredictor(nn.Module):
def __init__(self, cls_num, hid_size, biaffine_size, channels, ffnn_hid_size, dropout=0):
super().__init__()
self.mlp1 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
self.mlp2 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
self.biaffine = Biaffine(n_in=biaffine_size, n_out=cls_num, bias_x=True, bias_y=True)
self.mlp_rel = MLP(channels, ffnn_hid_size, dropout=dropout)
self.linear = nn.Linear(ffnn_hid_size, cls_num)
self.dropout = nn.Dropout(dropout)
def forward(self, x, y, z):
h = self.dropout(self.mlp1(x))
t = self.dropout(self.mlp2(y))
o1 = self.biaffine(h, t)
z = self.dropout(self.mlp_rel(z))
o2 = self.linear(z)
return o1 + o2
class Model(BaseModel):
def __init__(self, use_bert_last_4_layers=False):
super().__init__()
self.use_bert_last_4_layers = use_bert_last_4_layers
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, # segment_vocab_size=0,
output_all_encoded_layers = True if use_bert_last_4_layers else False)
lstm_input_size = self.bert.configs['hidden_size']
self.dis_embs = nn.Embedding(20, dist_emb_size)
self.reg_embs = nn.Embedding(3, type_emb_size)
self.encoder = nn.LSTM(lstm_input_size, lstm_hid_size // 2, num_layers=1, batch_first=True,
bidirectional=True)
conv_input_size = lstm_hid_size + dist_emb_size + type_emb_size
self.convLayer = ConvolutionLayer(conv_input_size, conv_hid_size, dilation, conv_dropout)
self.dropout = nn.Dropout(emb_dropout)
self.predictor = CoPredictor(label_num, lstm_hid_size, biaffine_size,
conv_hid_size * len(dilation), ffnn_hid_size, out_dropout)
self.cln = LayerNorm(lstm_hid_size, conditional_size=lstm_hid_size)
def forward(self, token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d):
bert_embs = self.bert([token_ids, torch.zeros_like(token_ids)])
if self.use_bert_last_4_layers:
bert_embs = torch.stack(bert_embs[-4:], dim=-1).mean(-1)
length = pieces2word.size(1)
min_value = torch.min(bert_embs).item()
# Max pooling: aggregate word-piece embeddings into word representations
_bert_embs = bert_embs.unsqueeze(1).expand(-1, length, -1, -1)
_bert_embs = torch.masked_fill(_bert_embs, pieces2word.eq(0).unsqueeze(-1), min_value)
word_reps, _ = torch.max(_bert_embs, dim=2)
# LSTM
word_reps = self.dropout(word_reps)
packed_embs = pack_padded_sequence(word_reps, sent_length.cpu(), batch_first=True, enforce_sorted=False)
packed_outs, (hidden, _) = self.encoder(packed_embs)
word_reps, _ = pad_packed_sequence(packed_outs, batch_first=True, total_length=sent_length.max())
# Conditional LayerNorm
cln = self.cln([word_reps.unsqueeze(2), word_reps])
# concat
dis_emb = self.dis_embs(dist_inputs)
tril_mask = torch.tril(grid_mask2d.clone().long())
reg_inputs = tril_mask + grid_mask2d.clone().long()
reg_emb = self.reg_embs(reg_inputs)
conv_inputs = torch.cat([dis_emb, reg_emb, cln], dim=-1)
# Convolution layer
conv_inputs = torch.masked_fill(conv_inputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
conv_outputs = self.convLayer(conv_inputs)
conv_outputs = torch.masked_fill(conv_outputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
# Output layer
outputs = self.predictor(word_reps, word_reps, conv_outputs)
return outputs
model = Model(use_bert_last_4_layers).to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, outputs, labels):
grid_labels, grid_mask2d, _ = labels
grid_mask2d = grid_mask2d.clone()
return super().forward(outputs[grid_mask2d], grid_labels[grid_mask2d])
bert_params = set(model.bert.parameters())
other_params = list(set(model.parameters()) - bert_params)
no_decay = ['bias', 'LayerNorm.weight']
params = [
{'params': [p for n, p in model.bert.named_parameters() if not any(nd in n for nd in no_decay)],
'lr': bert_learning_rate,
'weight_decay': weight_decay},
{'params': [p for n, p in model.bert.named_parameters() if any(nd in n for nd in no_decay)],
'lr': bert_learning_rate,
'weight_decay': 0.0},
{'params': other_params,
'lr': learning_rate,
'weight_decay': weight_decay},
]
optimizer = optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
updates_total = (len(train_dataloader) if steps_per_epoch is None else steps_per_epoch) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warm_factor * updates_total, num_training_steps=updates_total)
model.compile(loss=Loss(), optimizer=optimizer, scheduler=scheduler, clip_grad_norm=clip_grad_norm)
class Evaluator(Callback):
"""Evaluation and model saving
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, p, r, e_f1, e_p, e_r = self.evaluate(valid_dataloader)
if e_f1 > self.best_val_f1:
self.best_val_f1 = e_f1
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {p:.5f} r: {r:.5f}')
print(f'[val-entity level] f1: {e_f1:.5f}, p: {e_p:.5f} r: {e_r:.5f} best_f1: {self.best_val_f1:.5f}\n')
def evaluate(self, data_loader):
def cal_f1(c, p, r):
if r == 0 or p == 0:
return 0, 0, 0
r = c / r if r else 0
p = c / p if p else 0
if r and p:
return 2 * p * r / (p + r), p, r
return 0, p, r
pred_result = []
label_result = []
total_ent_r = 0
total_ent_p = 0
total_ent_c = 0
for data_batch in tqdm(data_loader, desc='Evaluate'):
(token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d), (grid_labels, grid_mask2d, entity_text) = data_batch
outputs = model.predict([token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d])
grid_mask2d = grid_mask2d.clone()
outputs = torch.argmax(outputs, -1)
ent_c, ent_p, ent_r, _ = self.decode(outputs.cpu().numpy(), entity_text, sent_length.cpu().numpy())
total_ent_r += ent_r
total_ent_p += ent_p
total_ent_c += ent_c
grid_labels = grid_labels[grid_mask2d].contiguous().view(-1)
outputs = outputs[grid_mask2d].contiguous().view(-1)
label_result.append(grid_labels.cpu())
pred_result.append(outputs.cpu())
label_result = torch.cat(label_result)
pred_result = torch.cat(pred_result)
p, r, f1, _ = precision_recall_fscore_support(label_result.numpy(), pred_result.numpy(), average="macro")
e_f1, e_p, e_r = cal_f1(total_ent_c, total_ent_p, total_ent_r)
return f1, p, r, e_f1, e_p, e_r
def decode(self, outputs, entities, length):
class Node:
def __init__(self):
self.THW = [] # [(tail, type)]
self.NNW = defaultdict(set) # {(head,tail): {next_index}}
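# Decoding (reverse of the label scheme used in load_data): instance[tail, head] > 1 is a THW
# edge that fixes an entity's head, tail and type; instance[i, j] == 1 with i < j is an NNW
# edge linking word i to a following in-entity word j. Starting from each head, chains of NNW
# edges are expanded (via the deque below) until the recorded tail is reached, which recovers
# the full, possibly discontinuous, word index sequence of every predicted entity.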
ent_r, ent_p, ent_c = 0, 0, 0
decode_entities = []
q = deque()
for instance, ent_set, l in zip(outputs, entities, length):
predicts = []
nodes = [Node() for _ in range(l)]
count = 0
for cur in reversed(range(l)):
# if count >= 29:
# print(count)
count += 1
heads = []
for pre in range(cur+1):
# THW
if instance[cur, pre] > 1:
nodes[pre].THW.append((cur, instance[cur, pre]))
heads.append(pre)
# NNW
if pre < cur and instance[pre, cur] == 1:
# cur node
for head in heads:
nodes[pre].NNW[(head,cur)].add(cur)
# post nodes
for head,tail in nodes[cur].NNW.keys():
if tail >= cur and head <= pre:
nodes[pre].NNW[(head,tail)].add(cur)
# entity
for tail,type_id in nodes[cur].THW:
if cur == tail:
predicts.append(([cur], type_id))
continue
q.clear()
q.append([cur])
while len(q) > 0:
chains = q.pop()
for idx in nodes[chains[-1]].NNW[(cur,tail)]:
if idx == tail:
predicts.append((chains + [idx], type_id))
else:
q.append(chains + [idx])
predicts = set([convert_index_to_text(x[0], x[1]) for x in predicts])
decode_entities.append([convert_text_to_index(x) for x in predicts])
ent_r += len(ent_set)
ent_p += len(predicts)
ent_c += len(predicts.intersection(ent_set))
return ent_c, ent_p, ent_r, decode_entities
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT+CRF cascade: stage 1 recognizes B/I/O spans, stage 2 classifies each extracted span
# Reference blog: https://zhuanlan.zhihu.com/p/166496466
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 98.11; entity_level: 96.23
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels, batch_entity_ids, batch_entity_labels = [], [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
entity_ids, entity_labels = [], []
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = 1 # mark B
labels[start + 1:end + 1] = 2 # mark I
entity_ids.append([start, end])
entity_labels.append(categories.index(label)+1)
if not entity_ids: # there must be at least one span
entity_ids.append([0, 0]) # pad with a dummy (0, 0) span if there is none
entity_labels.append(0)
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_entity_ids.append(entity_ids)
batch_entity_labels.append(entity_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device) # [btz, n_entities, start/end]
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device) # [btz, n_entities]
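# Targets for the two stages returned below: batch_labels is the 3-class (O=0 / B=1 / I=2)
# tag sequence for the stage-1 CRF; batch_entity_ids / batch_entity_labels hold the gold spans
# and their entity classes (0 reserved for the dummy span) for the stage-2 span classifier.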
return [batch_token_ids, batch_entity_ids], [batch_labels, batch_entity_labels]
# Convert the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.dense1 = nn.Linear(768, len(categories))
self.dense2 = nn.Linear(768, len(categories)+1) # +1 for the padding class
self.crf = CRF(len(categories))
def forward(self, inputs):
# Stage-1 output
token_ids, entity_ids = inputs[0], inputs[1]
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.dense1(last_hidden_state) # [bts, seq_len, tag_size]
attention_mask = token_ids.gt(0)
# Stage-2 output
btz, entity_count, _ = entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
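# Span-representation trick: entity_ids [btz, n_entities, 2] is flattened to [btz, 2*n_entities, 1]
# and expanded over the hidden dimension so that torch.gather picks the start- and end-token
# hidden states; reshaping back to [btz, n_entities, 2, hidden_size] and averaging over dim 2
# gives one vector per candidate span.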
entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
entity_states = torch.mean(entity_states, dim=2) # average the hidden states of the entity's start and end tokens
entity_logit = self.dense2(entity_states) # [btz, n_entities, n_entity_types]
return emission_score, attention_mask, entity_logit
def predict(self, token_ids):
self.eval()
with torch.no_grad():
# Stage-1 inference
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.dense1(last_hidden_state) # [bts, seq_len, tag_size]
attention_mask = token_ids.gt(0)
best_path = self.crf.decode(emission_score, attention_mask) # [bts, seq_len]
# Stage-2 inference
batch_entity_ids = []
for one_samp in best_path:
entity_ids = []
for j, item in enumerate(one_samp):
if item.item() == 1: # B
entity_ids.append([j, j])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and (item.item() == 2): # I
entity_ids[-1][-1] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
if not entity_ids: # there must be at least one span
entity_ids.append([0, 0]) # pad with a dummy (0, 0) span if there is none
batch_entity_ids.append([i for i in entity_ids if i])
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device) # [btz, n_entities, start/end]
btz, entity_count, _ = batch_entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
gather_index = batch_entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=gather_index).reshape(btz, entity_count, -1, hidden_size)
entity_states = torch.mean(entity_states, dim=2) # average the hidden states of the entity's start and end tokens
entity_logit = self.dense2(entity_states) # [btz, n_entities, n_entity_types]
entity_pred = torch.argmax(entity_logit, dim=-1) # [btz, n_entities]
# convert to a set of (sample id, start, end, entity type) tuples
entity_tuple = trans_entity2tuple(batch_entity_ids, entity_pred)
return best_path, entity_tuple
model = Model().to(device)
class Loss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.loss2 = nn.CrossEntropyLoss(ignore_index=0)
def forward(self, outputs, labels):
emission_score, attention_mask, entity_logit = outputs
seq_labels, entity_labels = labels
loss1 = model.crf(emission_score, attention_mask, seq_labels)
loss2 = self.loss2(entity_logit.reshape(-1, entity_logit.shape[-1]), entity_labels.flatten())
return {'loss': loss1+loss2, 'loss1': loss1, 'loss2': loss2}
# Keys returned by Loss are added to the metrics automatically, so loss1 and loss2 are printed even without listing them in metrics
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X1, Y1, Z1 = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for (token_ids, entity_ids), (label, entity_labels) in tqdm(data):
scores, entity_pred = model.predict(token_ids) # [btz, seq_len]
# Stage-1 metrics: token level
attention_mask = label.gt(0)
X1 += (scores.eq(label) * attention_mask).sum().item()
Y1 += scores.gt(0).sum().item()
Z1 += label.gt(0).sum().item()
# Stage-2 metrics: entity level
entity_true = trans_entity2tuple(entity_ids, entity_labels)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X1 / (Y1 + Z1), X1 / Y1, X1 / Z1
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(entity_ids, entity_labels):
'''Convert tensors to (sample id, start, end, entity type) tuples for metric computation
'''
entity_true = set()
for i, one_sample in enumerate(entity_ids):
for j, item in enumerate(one_sample):
if item[0].item() * item[1].item() != 0:
entity_true.add((i, item[0].item(), item[1].item(), entity_labels[i, j].item()))
return entity_true
class Evaluator(Callback):
"""Evaluation and model saving
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-stage 1] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-stage 2] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT+CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import time
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Convert the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories)) # including start and end
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
def acc(y_pred, y_true):
y_pred = y_pred[0]
y_pred = torch.argmax(y_pred, dim=-1)
acc = torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
return {'acc': acc}
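# Note: acc is computed from the raw emission scores (argmax over tags) and the denominator
# y_true.numel() counts every position including padding, so it is only a rough training-time
# indicator; the CRF-decoded, mask-aware metrics come from evaluate() below.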
# Several custom metric formats are supported: metrics=['accuracy', acc, {acc: acc}] all work
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=acc)
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors to (sample id, start, end, entity type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""Evaluation and model saving
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
logs["f1"] = f2
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
# time_fw is the file object for the timing log, written to 'log/time.txt'
os.makedirs('log', exist_ok=True)  # make sure the log directory exists before opening the file
time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
# record the time at which the run starts
time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
# record the time at which the run ends
time_fw.write('End Time: {:.6f}\n'.format(time.time()))
time_fw.flush()
time_fw.close()
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT+CRF for named entity recognition, with part-of-speech tags as an extra embedding
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.30; entity_level: 96.09
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import jieba.posseg as psg
from collections import Counter
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
psg_map = {v: i+1 for i, v in enumerate(['a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 'e', 'f', 'g', 'h', 'i', 'j',
'k', 'l', 'm', 'mg', 'mq', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 'rg', 'rr', 'rz', 's', 't',
'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq', 'x', 'y', 'z', 'zg'])}
def collate_fn(batch):
batch_token_ids, batch_psg_ids, batch_labels = [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens) # span of the i-th token in the original text
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
# Build the part-of-speech (POS) inputs
seg = [(i, p) for word, p in psg.cut(d[0]) for i in word]
seg_word, seg_p = zip(*seg)
psg_ids = np.zeros(len(token_ids))
for i, j in enumerate(mapping):
if j:
start, end = j[0], j[-1] # first/last character position of the token in the original text
token_new = (''.join(seg_word[start:end+1])).lower()
assert tokens[i] == token_new, f"{tokens[i]} -> {token_new}"
if start == end:
psg_ids[i] = psg_map.get(seg_p[start], 0) # 0 if the POS tag is not in the map
else:
psg_ids[i] = psg_map.get(Counter(seg_p[start:end+1]).most_common(1)[0][0], 0) # take the most frequent POS tag
batch_psg_ids.append(psg_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_psg_ids = torch.tensor(sequence_padding(batch_psg_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return [batch_token_ids, batch_psg_ids], batch_labels
# Convert the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
layer_add_embs = nn.Embedding(len(psg_map)+1, 768)
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0,
layer_add_embs=layer_add_embs)
self.fc = nn.Linear(768, len(categories))
self.crf = CRF(len(categories))
def forward(self, token_ids, psg_ids):
sequence_output = self.bert([token_ids, psg_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [bts, seq_len, tag_size]
attention_mask = token_ids.gt(0)
return emission_score, attention_mask
def predict(self, token_ids, psg_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids, psg_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [bts, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for (token_ids, psg_ids), label in tqdm(data):
scores = model.predict(token_ids, psg_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors to (sample id, start, end, entity type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""Evaluation and model saving
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT+CRF for named entity recognition, comparing two ways of using transition statistics
# from the training set: as a trainable initialization of the CRF, or as fixed (frozen) weights
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Initialization only (trainable): [valid_f1] token_level: 97.35; entity_level: 96.42
# Frozen: [valid_f1] token_level: 96.92; entity_level: 95.42
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Convert the dataset
train_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')
valid_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev')
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)
# Generate transition weights from the training data
transition = np.zeros((len(categories), len(categories)))
start_transition = np.zeros(len(categories))
end_transition = np.zeros(len(categories))
for d in tqdm(train_data, desc='Generate init_transitions'):
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
for i in range(len(labels)-1):
transition[int(labels[i]), int(labels[i+1])] += 1
start_transition[int(labels[0])] += 1 # transition from start to the first tag
end_transition[int(labels[-1])] += 1 # transition from the last tag to end
max_v = np.max([np.max(transition), np.max(start_transition), np.max(end_transition)])
min_v = np.min([np.min(transition), np.min(start_transition), np.min(end_transition)])
transition = (transition - min_v) / (max_v - min_v)
start_transition = (start_transition - min_v) / (max_v - min_v)
end_transition = (end_transition - min_v) / (max_v - min_v)
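# The raw transition / start / end counts collected above are min-max normalized jointly to
# [0, 1] and passed to the CRF below as initial transition scores; with freeze=True they stay
# fixed during training, otherwise they only serve as an initialization.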
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories))
self.crf = CRF(len(categories), init_transitions=[transition, start_transition, end_transition], freeze=True) # controls whether transitions are initialized from data and whether they are trainable
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors to (sample id, start, end, entity type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""Evaluation and model saving
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# EfficientGlobalPointer for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Blog: https://kexue.fm/archives/8373
# [valid_f1]: 96.55
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import EfficientGlobalPointer
maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
data.append((text, label)) # label is [[start, end, entity_type], ...]
return data
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for i, (text, text_labels) in enumerate(batch):
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros((len(categories_label2id), maxlen, maxlen))
for start, end, label in text_labels:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label = categories_label2id[label]
labels[label, start, end] = 1
batch_token_ids.append(token_ids) # length already capped by maxlen above
batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Convert the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.global_pointer = EfficientGlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
return logit
model = Model().to(device)
class MyLoss(MultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_pred, y_true):
y_true = y_true.view(y_true.shape[0]*y_true.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
y_pred = y_pred.view(y_pred.shape[0]*y_pred.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
return super().forward(y_pred, y_true)
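# GlobalPointer-style training treats NER as multi-label classification over the span score
# matrix: y_pred / y_true of shape [btz, heads, seq_len, seq_len] (one head per entity type)
# are flattened to [btz*heads, seq_len*seq_len], so each (type, sentence) pair becomes one
# multi-label sample for the multilabel categorical crossentropy (see https://kexue.fm/archives/8373).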
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
X, Y, Z = 1e-10, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true)
for i, score in enumerate(scores):
R = set()
for l, start, end in zip(*np.where(score.cpu() > threshold)):
R.add((start, end, categories_id2label[l]))
T = set()
for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
T.add((start, end, categories_id2label[l]))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""Evaluation and model saving
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# GlobalPointer for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Blog: https://kexue.fm/archives/8373
# [valid_f1]: 95.66
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import GlobalPointer
import random
import os
maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
data.append((text, label)) # label is [[start, end, entity_type], ...]
return data
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for i, (text, text_labels) in enumerate(batch):
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros((len(categories_label2id), maxlen, maxlen))
for start, end, label in text_labels:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label = categories_label2id[label]
labels[label, start, end] = 1
batch_token_ids.append(token_ids) # length already capped by maxlen above
batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Convert the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.global_pointer = GlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
return logit
model = Model().to(device)
class MyLoss(MultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_pred, y_true):
y_true = y_true.view(y_true.shape[0]*y_true.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
y_pred = y_pred.view(y_pred.shape[0]*y_pred.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
return super().forward(y_pred, y_true)
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
X, Y, Z = 0, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true)
for i, score in enumerate(scores):
R = set()
for l, start, end in zip(*np.where(score.cpu() > threshold)):
R.add((start, end, categories_id2label[l]))
T = set()
for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
T.add((start, end, categories_id2label[l]))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""Evaluation and model saving
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# MRC (machine reading comprehension) approach to NER
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]: 95.75
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from collections import defaultdict
max_c_len = 224
max_q_len = 32
batch_size = 6 # the effective batch size is batch_size * number of entity types
categories = ['LOC', 'PER', 'ORG']
ent2query = {"LOC": "找出下述句子中的地址名",
"PER": "找出下述句子中的人名",
"ORG": "找出下述句子中的机构名"}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_start_labels, batch_end_labels = [], [], [], []
batch_ent_type = []
for d in batch:
tokens_b = tokenizer.tokenize(d[0], maxlen=max_c_len)[1:] # drop the [CLS] token
mapping = tokenizer.rematch(d[0], tokens_b)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
# group entities by type
label_dict = defaultdict(list)
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label_dict[label].append((start, end))
# iterate over entity types: the query is tokens_a, the context is tokens_b
# sample layout: [CLS] + tokens_a + [SEP] + tokens_b + [SEP]
for _type in categories:
start_ids = [0] * len(tokens_b)
end_ids = [0] * len(tokens_b)
text_a = ent2query[_type]
tokens_a = tokenizer.tokenize(text_a, maxlen=max_q_len)
for _label in label_dict[_type]:
start_ids[_label[0]] = 1
end_ids[_label[1]] = 1
start_ids = [0] * len(tokens_a) + start_ids
end_ids = [0] * len(tokens_a) + end_ids
token_ids = tokenizer.tokens_to_ids(tokens_a) + tokenizer.tokens_to_ids(tokens_b)
segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)
assert len(start_ids) == len(end_ids) == len(token_ids) == len(segment_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_start_labels.append(start_ids)
batch_end_labels.append(end_ids)
batch_ent_type.append(_type)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_start_labels = torch.tensor(sequence_padding(batch_start_labels), dtype=torch.long, device=device)
batch_end_labels = torch.tensor(sequence_padding(batch_end_labels), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_segment_ids, batch_start_labels, batch_end_labels, batch_ent_type]
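# Illustrative sketch (hypothetical helper, never called by this script): how one MRC sample
# is assembled. The query tokens keep their [CLS]/[SEP] markers while the context tokens drop
# the leading [CLS], giving [CLS] + query + [SEP] + context + [SEP]. The sentence is made up.
def _demo_mrc_sample():
    tokens_a = tokenizer.tokenize(ent2query['PER'], maxlen=max_q_len)  # ['[CLS]', ..., '[SEP]']
    tokens_b = tokenizer.tokenize('张三来了', maxlen=max_c_len)[1:]      # [..., '[SEP]']
    return tokens_a + tokens_b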
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path)
self.mid_linear = nn.Sequential(
nn.Linear(768, 128),
nn.ReLU(),
nn.Dropout(0.1)
)
self.start_fc = nn.Linear(128, 2)
self.end_fc = nn.Linear(128, 2)
def forward(self, token_ids, segment_ids):
sequence_output = self.bert([token_ids, segment_ids]) # [bts, seq_len, hdsz]
seq_out = self.mid_linear(sequence_output) # [bts, seq_len, mid_dims]
start_logits = self.start_fc(seq_out) # [bts, seq_len, 2]
end_logits = self.end_fc(seq_out) # [bts, seq_len, 2]
return start_logits, end_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, outputs, labels):
start_logits, end_logits = outputs
mask, start_ids, end_ids = labels[:3]
start_logits = start_logits.view(-1, 2)
end_logits = end_logits.view(-1, 2)
# drop the labels on text_a and padding positions and compute the loss on real tokens only
active_loss = mask.view(-1) == 1
active_start_logits = start_logits[active_loss]
active_end_logits = end_logits[active_loss]
active_start_labels = start_ids.view(-1)[active_loss]
active_end_labels = end_ids.view(-1)[active_loss]
start_loss = super().forward(active_start_logits, active_start_labels)
end_loss = super().forward(active_end_logits, active_end_labels)
return start_loss + end_loss
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 0, 1e-10, 1e-10
for (token_ids, segment_ids), labels in tqdm(data, desc='Evaluation'):
start_logit, end_logit = model.predict([token_ids, segment_ids]) # [btz, seq_len, 2]
mask, start_ids, end_ids, ent_type = labels
# entity-level metrics
entity_pred = mrc_decode(start_logit, end_logit, ent_type, mask)
entity_true = mrc_decode(start_ids, end_ids, ent_type)
X += len(entity_pred.intersection(entity_true))
Y += len(entity_pred)
Z += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X/ Y, X / Z
return f1, precision, recall
# strict decoding baseline
def mrc_decode(start_preds, end_preds, ent_type, mask=None):
'''return the (start, end) spans of the decoded entities
'''
predict_entities = set()
if mask is not None: # for predictions, mask out the query and padding positions
start_preds = torch.argmax(start_preds, -1) * mask
end_preds = torch.argmax(end_preds, -1) * mask
start_preds = start_preds.cpu().numpy()
end_preds = end_preds.cpu().numpy()
for bt_i in range(start_preds.shape[0]):
start_pred = start_preds[bt_i]
end_pred = end_preds[bt_i]
# collect the results for each sample
for i, s_type in enumerate(start_pred):
if s_type == 0:
continue
for j, e_type in enumerate(end_pred[i:]):
if s_type == e_type:
# (sample id, entity start, entity end, entity type)
predict_entities.add((bt_i, i, i+j, ent_type[bt_i]))
break
return predict_entities
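# Illustrative sketch (hypothetical helper, never called by this script): with one sample whose
# start head fires at position 1 and whose end head fires at position 3, the strict decoder
# pairs them and returns {(0, 1, 3, 'PER')}. All tensors below are made up.
def _demo_mrc_decode():
    start = torch.tensor([[[9., 0.], [0., 9.], [9., 0.], [9., 0.]]])  # argmax -> [0, 1, 0, 0]
    end = torch.tensor([[[9., 0.], [9., 0.], [9., 0.], [0., 9.]]])    # argmax -> [0, 0, 0, 1]
    mask = torch.ones(1, 4, dtype=torch.long)
    return mrc_decode(start, end, ['PER'], mask)  # {(0, 1, 3, 'PER')}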
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# span-based (start/end pointer) approach
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]: 96.31
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.losses import FocalLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
max_len = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']
categories_id2label = {i: k for i, k in enumerate(categories, start=1)}
categories_label2id = {k: i for i, k in enumerate(categories, start=1)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_start_labels, batch_end_labels = [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=max_len)[1:] # drop the leading [CLS]
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
start_ids = [0] * len(tokens)
end_ids = [0] * len(tokens)
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
start_ids[start] = categories_label2id[label]
end_ids[end] = categories_label2id[label]
batch_token_ids.append(token_ids)
batch_start_labels.append(start_ids)
batch_end_labels.append(end_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_start_labels = torch.tensor(sequence_padding(batch_start_labels), dtype=torch.long, device=device)
batch_end_labels = torch.tensor(sequence_padding(batch_end_labels), dtype=torch.long, device=device)
batch_mask = batch_token_ids.gt(0).long()
return [batch_token_ids], [batch_mask, batch_start_labels, batch_end_labels]
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.mid_linear = nn.Sequential(
nn.Linear(768, 128),
nn.ReLU(),
nn.Dropout(0.1)
)
self.start_fc = nn.Linear(128, len(categories)+1) # class 0 means no entity
self.end_fc = nn.Linear(128, len(categories)+1)
def forward(self, token_ids):
sequence_output = self.bert(token_ids) # [bts, seq_len, hdsz]
seq_out = self.mid_linear(sequence_output) # [bts, seq_len, mid_dims]
start_logits = self.start_fc(seq_out) # [bts, seq_len, num_tags]
end_logits = self.end_fc(seq_out) # [bts, seq_len, num_tags]
return start_logits, end_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, outputs, labels):
start_logits, end_logits = outputs
mask, start_ids, end_ids = labels
start_logits = start_logits.view(-1, len(categories)+1)
end_logits = end_logits.view(-1, len(categories)+1)
# drop the labels on padding positions and compute the loss on real tokens only
active_loss = mask.view(-1) == 1
active_start_logits = start_logits[active_loss]
active_end_logits = end_logits[active_loss]
active_start_labels = start_ids.view(-1)[active_loss]
active_end_labels = end_ids.view(-1)[active_loss]
start_loss = super().forward(active_start_logits, active_start_labels)
end_loss = super().forward(active_end_logits, active_end_labels)
return start_loss + end_loss
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 0, 1e-10, 1e-10
for token_ids, labels in tqdm(data, desc='Evaluation'):
start_logit, end_logit = model.predict(token_ids) # [btz, seq_len, num_tags]
mask, start_ids, end_ids = labels
# entity-level metrics
entity_pred = span_decode(start_logit, end_logit, mask)
entity_true = span_decode(start_ids, end_ids)
X += len(entity_pred.intersection(entity_true))
Y += len(entity_pred)
Z += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X/ Y, X / Z
return f1, precision, recall
# strict decoding baseline
def span_decode(start_preds, end_preds, mask=None):
'''return the (start, end) spans of the decoded entities
'''
predict_entities = set()
if mask is not None: # mask out the padding positions
start_preds = torch.argmax(start_preds, -1) * mask
end_preds = torch.argmax(end_preds, -1) * mask
start_preds = start_preds.cpu().numpy()
end_preds = end_preds.cpu().numpy()
for bt_i in range(start_preds.shape[0]):
start_pred = start_preds[bt_i]
end_pred = end_preds[bt_i]
# collect the results for each sample
for i, s_type in enumerate(start_pred):
if s_type == 0:
continue
for j, e_type in enumerate(end_pred[i:]):
if s_type == e_type:
# (sample id, entity start, entity end, entity type)
predict_entities.add((bt_i, i, i+j, categories_id2label[s_type]))
break
return predict_entities
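# Illustrative sketch (hypothetical helper, never called by this script): the start head fires
# with class 2 ('PER') at position 0 and the end head fires with the same class at position 1,
# so the decoded entity is (sample 0, start 0, end 1, 'PER'). Logits below are made up.
def _demo_span_decode():
    start = torch.tensor([[[0., 0., 9., 0.], [9., 0., 0., 0.]]])  # argmax -> [2, 0]
    end = torch.tensor([[[9., 0., 0., 0.], [0., 0., 9., 0.]]])    # argmax -> [0, 2]
    return span_decode(start, end, mask=torch.ones(1, 2, dtype=torch.long))  # {(0, 0, 1, 'PER')}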
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# tplinker_plus for named entity recognition
# [valid_f1]: 95.71
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import TplinkerHandshakingKernel
maxlen = 64
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
text_list = tokenizer.tokenize(text)[1:-1] # drop the leading [CLS] and trailing [SEP]
tokens = [j for i in text_list for j in i][:maxlen] # character-level tokens
data.append((tokens, label)) # label is [[start, end, entity_type], ...]
return data
def trans_ij2k(seq_len, i, j):
'''map row i, column j to its index in the flattened upper triangle
'''
if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
return 0
return int(0.5*(2*seq_len-i+1)*i+(j-i))
map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}
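# Illustrative check (hypothetical helper, never called by this script): for seq_len=4 the
# upper triangle (i <= j) is flattened row by row, so (0,0)->0 ... (0,3)->3, (1,1)->4,
# (1,2)->5, (1,3)->6, (2,2)->7, (2,3)->8, (3,3)->9, i.e. 4*(4+1)//2 = 10 positions in total.
def _demo_trans_ij2k():
    return [trans_ij2k(4, i, j) for i in range(4) for j in range(4) if j >= i]  # [0, 1, ..., 9]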
def tran_ent_rel2id():
'''build the label mapping for the final classification layer
'''
tag2id = {}
for p in categories_label2id.keys():
tag2id[p] = len(tag2id)
return tag2id
tag2id = tran_ent_rel2id()
id2tag = {v: k for k, v in tag2id.items()}
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
pair_len = maxlen * (maxlen+1)//2
# batch_head_labels: [btz, pair_len, tag2id_len]
batch_labels = torch.zeros((len(batch), pair_len, len(tag2id)), dtype=torch.long, device=device)
batch_token_ids = []
for i, (tokens, labels) in enumerate(batch):
batch_token_ids.append(tokenizer.tokens_to_ids(tokens)) # length already truncated above
for s_i in labels:
if s_i[1] >= len(tokens): # skip entities whose end exceeds the text length
continue
batch_labels[i, map_ij2k[s_i[0], s_i[1]], tag2id[s_i[2]]] = 1
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
return [batch_token_ids], batch_labels
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(tag2id))
self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cln_plus', inner_enc_type='lstm')
def forward(self, inputs):
last_hidden_state = self.bert(inputs) # [btz, seq_len, hdsz]
shaking_hiddens = self.handshaking_kernel(last_hidden_state)
output = self.fc(shaking_hiddens) # [btz, pair_len, tag_size]
return output
model = Model().to(device)
model.compile(loss=MultilabelCategoricalCrossentropy(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
X, Y, Z = 0, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true) # [btz, pair_len, tag_size]
for i, score in enumerate(scores):
R = set()
for pair_id, tag_id in zip(*np.where(score.cpu().numpy() > threshold)):
start, end = map_k2ij[pair_id][0], map_k2ij[pair_id][1]
R.add((start, end, tag_id))
T = set()
for pair_id, tag_id in zip(*np.where(label[i].cpu().numpy() > threshold)):
start, end = map_k2ij[pair_id][0], map_k2ij[pair_id][1]
T.add((start, end, tag_id))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
import argparse
import collections
import json
import os
import pickle
import torch
import logging
import shutil
from tqdm import tqdm
import time
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('log')
def get_path_from_url(url, root_dir, check_exist=True, decompress=True):
""" Download from given url to root_dir.
if file or directory specified by url is exists under
root_dir, return the path directly, otherwise download
from url and decompress it, return the path.
Args:
url (str): download url
root_dir (str): root dir for downloading, it should be
WEIGHTS_HOME or DATASET_HOME
decompress (bool): decompress zip or tar file. Default is `True`
Returns:
str: a local path to save downloaded models & weights & datasets.
"""
import os.path
import os
import tarfile
import zipfile
def is_url(path):
"""
Whether path is URL.
Args:
path (string): URL string or not.
"""
return path.startswith('http://') or path.startswith('https://')
def _map_path(url, root_dir):
# parse path after download under root_dir
fname = os.path.split(url)[-1]
fpath = fname
return os.path.join(root_dir, fpath)
def _get_download(url, fullname):
import requests
# using requests.get method
fname = os.path.basename(fullname)
try:
req = requests.get(url, stream=True)
except Exception as e: # requests.exceptions.ConnectionError
logger.info("Downloading {} from {} failed with exception {}".format(
fname, url, str(e)))
return False
if req.status_code != 200:
raise RuntimeError("Downloading from {} failed with code "
"{}!".format(url, req.status_code))
# To protect against interrupted downloads, download to
# tmp_fullname first, then move tmp_fullname to fullname
# once the download has finished
tmp_fullname = fullname + "_tmp"
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
with tqdm(total=(int(total_size) + 1023) // 1024, unit='KB') as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
pbar.update(1)
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
shutil.move(tmp_fullname, fullname)
return fullname
def _download(url, path):
"""
Download from url, save to path.
url (str): download url
path (str): download to given path
"""
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.split(url)[-1]
fullname = os.path.join(path, fname)
retry_cnt = 0
logger.info("Downloading {} from {}".format(fname, url))
DOWNLOAD_RETRY_LIMIT = 3
while not os.path.exists(fullname):
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
retry_cnt += 1
else:
raise RuntimeError("Download from {} failed. "
"Retry limit reached".format(url))
if not _get_download(url, fullname):
time.sleep(1)
continue
return fullname
def _uncompress_file_zip(filepath):
with zipfile.ZipFile(filepath, 'r') as files:
file_list = files.namelist()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
# `strip(os.sep)` to remove `os.sep` in the tail of path
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _is_a_single_file(file_list):
if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
return True
return False
def _is_a_single_dir(file_list):
new_file_list = []
for file_path in file_list:
if '/' in file_path:
file_path = file_path.replace('/', os.sep)
elif '\\' in file_path:
file_path = file_path.replace('\\', os.sep)
new_file_list.append(file_path)
file_name = new_file_list[0].split(os.sep)[0]
for i in range(1, len(new_file_list)):
if file_name != new_file_list[i].split(os.sep)[0]:
return False
return True
def _uncompress_file_tar(filepath, mode="r:*"):
with tarfile.open(filepath, mode) as files:
file_list = files.getnames()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _decompress(fname):
"""
Decompress for zip and tar file
"""
logger.info("Decompressing {}...".format(fname))
# To protect against interrupted decompression,
# decompress into a fpath_tmp directory first; if decompression
# succeeds, move the decompressed files to fpath, delete
# fpath_tmp and remove the downloaded compressed file.
if tarfile.is_tarfile(fname):
uncompressed_path = _uncompress_file_tar(fname)
elif zipfile.is_zipfile(fname):
uncompressed_path = _uncompress_file_zip(fname)
else:
raise TypeError("Unsupport compress file type {}".format(fname))
return uncompressed_path
assert is_url(url), "downloading from {} not a url".format(url)
fullpath = _map_path(url, root_dir)
if os.path.exists(fullpath) and check_exist:
logger.info("Found {}".format(fullpath))
else:
fullpath = _download(url, root_dir)
if decompress and (tarfile.is_tarfile(fullpath) or
zipfile.is_zipfile(fullpath)):
fullpath = _decompress(fullpath)
return fullpath
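# Illustrative usage sketch (hypothetical helper, never called here): download a single
# resource file into ./uie-base/ and return its local path; the URL below is the uie_base
# vocab entry that also appears in MODEL_MAP further down.
def _demo_get_path_from_url():
    url = 'https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt'
    return get_path_from_url(url, root_dir='uie-base')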
MODEL_MAP = {
"uie-base": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json"
}
},
"uie-medium": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-mini": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-micro": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-nano": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-medical-base": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medical_base_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-tiny": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/tokenizer_config.json"
}
}
}
def build_params_map(attention_num=12):
"""
build the parameter-name map from PaddlePaddle's ERNIE to the torch BERT layout
:return:
"""
weight_map = collections.OrderedDict({
'encoder.embeddings.word_embeddings.weight': "bert.embeddings.word_embeddings.weight",
'encoder.embeddings.position_embeddings.weight': "bert.embeddings.position_embeddings.weight",
'encoder.embeddings.token_type_embeddings.weight': "bert.embeddings.token_type_embeddings.weight",
'encoder.embeddings.task_type_embeddings.weight': "embeddings.task_type_embeddings.weight", # no 'bert.' prefix here; maps directly onto the bert4torch structure
'encoder.embeddings.layer_norm.weight': 'bert.embeddings.LayerNorm.weight',
'encoder.embeddings.layer_norm.bias': 'bert.embeddings.LayerNorm.bias',
})
# add attention layers
for i in range(attention_num):
weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.query.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.query.bias'
weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.key.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.key.bias'
weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.value.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.value.bias'
weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.weight'] = f'bert.encoder.layer.{i}.attention.output.dense.weight'
weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.bias'] = f'bert.encoder.layer.{i}.attention.output.dense.bias'
weight_map[f'encoder.encoder.layers.{i}.norm1.weight'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.weight'
weight_map[f'encoder.encoder.layers.{i}.norm1.bias'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.bias'
weight_map[f'encoder.encoder.layers.{i}.linear1.weight'] = f'bert.encoder.layer.{i}.intermediate.dense.weight'
weight_map[f'encoder.encoder.layers.{i}.linear1.bias'] = f'bert.encoder.layer.{i}.intermediate.dense.bias'
weight_map[f'encoder.encoder.layers.{i}.linear2.weight'] = f'bert.encoder.layer.{i}.output.dense.weight'
weight_map[f'encoder.encoder.layers.{i}.linear2.bias'] = f'bert.encoder.layer.{i}.output.dense.bias'
weight_map[f'encoder.encoder.layers.{i}.norm2.weight'] = f'bert.encoder.layer.{i}.output.LayerNorm.weight'
weight_map[f'encoder.encoder.layers.{i}.norm2.bias'] = f'bert.encoder.layer.{i}.output.LayerNorm.bias'
# add pooler
weight_map.update(
{
'encoder.pooler.dense.weight': 'bert.pooler.dense.weight',
'encoder.pooler.dense.bias': 'bert.pooler.dense.bias',
'linear_start.weight': 'linear_start.weight',
'linear_start.bias': 'linear_start.bias',
'linear_end.weight': 'linear_end.weight',
'linear_end.bias': 'linear_end.bias',
}
)
return weight_map
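# Illustrative usage (hypothetical helper, never called during conversion): build_params_map
# returns an OrderedDict keyed by paddle ERNIE tensor names, e.g. the first attention layer's
# query projection is renamed as shown below before being loaded into the torch state_dict.
def _demo_params_map():
    weight_map = build_params_map(attention_num=12)
    return weight_map['encoder.encoder.layers.0.self_attn.q_proj.weight']
    # -> 'bert.encoder.layer.0.attention.self.query.weight'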
def extract_and_convert(input_dir, output_dir):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
logger.info('=' * 20 + 'save config file' + '=' * 20)
config = json.load(open(os.path.join(input_dir, 'model_config.json'), 'rt', encoding='utf-8'))
config = config['init_args'][0]
config["architectures"] = ["UIE"]
config['layer_norm_eps'] = 1e-12
del config['init_class']
if 'sent_type_vocab_size' in config:
config['type_vocab_size'] = config['sent_type_vocab_size']
config['intermediate_size'] = 4 * config['hidden_size']
json.dump(config, open(os.path.join(output_dir, 'config.json'),
'wt', encoding='utf-8'), indent=4)
logger.info('=' * 20 + 'save vocab file' + '=' * 20)
with open(os.path.join(input_dir, 'vocab.txt'), 'rt', encoding='utf-8') as f:
words = f.read().splitlines()
words_set = set()
words_duplicate_indices = []
for i in range(len(words)-1, -1, -1):
word = words[i]
if word in words_set:
words_duplicate_indices.append(i)
words_set.add(word)
for i, idx in enumerate(words_duplicate_indices):
words[idx] = chr(0x1F6A9+i) # Change duplicated word to 🚩 LOL
with open(os.path.join(output_dir, 'vocab.txt'), 'wt', encoding='utf-8') as f:
for word in words:
f.write(word+'\n')
special_tokens_map = {
"unk_token": "[UNK]",
"sep_token": "[SEP]",
"pad_token": "[PAD]",
"cls_token": "[CLS]",
"mask_token": "[MASK]"
}
json.dump(special_tokens_map, open(os.path.join(output_dir, 'special_tokens_map.json'),
'wt', encoding='utf-8'))
tokenizer_config = {
"do_lower_case": True,
"unk_token": "[UNK]",
"sep_token": "[SEP]",
"pad_token": "[PAD]",
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"tokenizer_class": "BertTokenizer"
}
json.dump(tokenizer_config, open(os.path.join(output_dir, 'tokenizer_config.json'),
'wt', encoding='utf-8'))
logger.info('=' * 20 + 'extract weights' + '=' * 20)
state_dict = collections.OrderedDict()
weight_map = build_params_map(attention_num=config['num_hidden_layers'])
paddle_paddle_params = pickle.load(
open(os.path.join(input_dir, 'model_state.pdparams'), 'rb'))
del paddle_paddle_params['StructuredToParameterName@@']
for weight_name, weight_value in paddle_paddle_params.items():
if 'weight' in weight_name:
if 'encoder.encoder' in weight_name or 'pooler' in weight_name or 'linear' in weight_name:
weight_value = weight_value.transpose()
# Fix: embedding error
if 'word_embeddings.weight' in weight_name:
weight_value[0, :] = 0
if weight_name not in weight_map:
logger.info(f"{'='*20} [SKIP] {weight_name} {'='*20}")
continue
state_dict[weight_map[weight_name]] = torch.FloatTensor(weight_value)
logger.info(f"{weight_name} -> {weight_map[weight_name]} {weight_value.shape}")
torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))
def check_model(input_model):
if not os.path.exists(input_model):
if input_model not in MODEL_MAP:
raise ValueError('input_model does not exist!')
resource_file_urls = MODEL_MAP[input_model]['resource_file_urls']
logger.info("Downloading resource files...")
for key, val in resource_file_urls.items():
file_path = os.path.join(input_model, key)
if not os.path.exists(file_path):
get_path_from_url(val, input_model)
def do_main():
check_model(args.input_model)
extract_and_convert(args.input_model, args.output_model)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_model", default="uie-base", type=str,
help="Directory of input paddle model.\n Will auto download model [uie-base/uie-tiny]")
parser.add_argument("-o", "--output_model", default="uie_base_pytorch", type=str,
help="Directory of output pytorch model")
args = parser.parse_args()
do_main()
# Data generation step 1
python finetune_step1_dataprocess.py
# Data generation step 2
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/train.json \
--task_type "ext" \
--splits 1.0 0.0 0.0 \
--save_dir ./data/final_data/ \
--negative_ratio 3
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/dev.json \
--task_type "ext" \
--splits 0.0 1.0 0.0 \
--save_dir ./data/final_data/ \
--negative_ratio 0
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/test.json \
--task_type "ext" \
--splits 0.0 0.0 1.0 \
--save_dir ./data/final_data/ \
--negative_ratio 0
# finetune training
python finetune_step3_train.py
# Data conversion step 1
import os
import re
import json
en2ch = {
'ORG':'机构',
'PER':'人名',
'LOC':'籍贯'
}
def preprocess(input_path, save_path, mode):
if not os.path.exists(save_path):
os.makedirs(save_path)
data_path = os.path.join(save_path, mode + ".json")
result = []
tmp = {}
tmp['id'] = 0
tmp['text'] = ''
tmp['relations'] = []
tmp['entities'] = []
# ======= first collect each sentence and all entities/types it contains =======
with open(input_path,'r',encoding='utf-8') as fp:
lines = fp.readlines()
texts = []
entities = []
words = []
entity_tmp = []
entities_tmp = []
entity_label = ''
for line in lines:
line = line.strip().split(" ")
if len(line) == 2:
word = line[0]
label = line[1]
words.append(word)
if "B-" in label:
entity_tmp.append(word)
entity_label = en2ch[label.split("-")[-1]]
elif "I-" in label:
entity_tmp.append(word)
if (label == 'O') and entity_tmp:
if ("".join(entity_tmp), entity_label) not in entities_tmp:
entities_tmp.append(("".join(entity_tmp), entity_label))
entity_tmp, entity_label = [], ''
else:
if entity_tmp and (("".join(entity_tmp), entity_label) not in entities_tmp):
entities_tmp.append(("".join(entity_tmp), entity_label))
entity_tmp, entity_label = [], ''
texts.append("".join(words))
entities.append(entities_tmp)
words = []
entities_tmp = []
# ==========================================
# ======= locate the position of each entity within the sentence =======
i = 0
for text,entity in zip(texts, entities):
if entity:
ltmp = []
for ent,type in entity:
for span in re.finditer(ent, text):
start = span.start()
end = span.end()
ltmp.append((type, start, end, ent))
# print(ltmp)
ltmp = sorted(ltmp, key=lambda x:(x[1],x[2]))
for j in range(len(ltmp)):
# tmp['entities'].append(["".format(str(j)), ltmp[j][0], ltmp[j][1], ltmp[j][2], ltmp[j][3]])
tmp['entities'].append({"id":j, "start_offset":ltmp[j][1], "end_offset":ltmp[j][2], "label":ltmp[j][0]})
else:
tmp['entities'] = []
tmp['id'] = i
tmp['text'] = text
result.append(tmp)
tmp = {}
tmp['id'] = 0
tmp['text'] = ''
tmp['relations'] = []
tmp['entities'] = []
i += 1
with open(data_path, 'w', encoding='utf-8') as fp:
fp.write("\n".join([json.dumps(i, ensure_ascii=False) for i in result]))
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train", './data/mid_data', "train")
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev", './data/mid_data', "dev")
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.test", './data/mid_data', "test")
# Data generation step 2
import os
import time
import argparse
import json
from decimal import Decimal
import numpy as np
from bert4torch.snippets import seed_everything
from utils import convert_ext_examples, convert_cls_examples, logger
def do_convert():
seed_everything(args.seed)
tic_time = time.time()
if not os.path.exists(args.doccano_file):
raise ValueError("Please input the correct path of doccano file.")
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
if len(args.splits) != 0 and len(args.splits) != 3:
raise ValueError("Only []/ len(splits)==3 accepted for splits.")
def _check_sum(splits):
return Decimal(str(splits[0])) + Decimal(str(splits[1])) + Decimal(
str(splits[2])) == Decimal("1")
if len(args.splits) == 3 and not _check_sum(args.splits):
raise ValueError(
"Please set correct splits, sum of elements in splits should be equal to 1."
)
with open(args.doccano_file, "r", encoding="utf-8") as f:
raw_examples = f.readlines()
def _create_ext_examples(examples,
negative_ratio=0,
shuffle=False,
is_train=True):
entities, relations = convert_ext_examples(
examples, negative_ratio, is_train=is_train)
examples = entities + relations
if shuffle:
indexes = np.random.permutation(len(examples))
examples = [examples[i] for i in indexes]
return examples
def _create_cls_examples(examples, prompt_prefix, options, shuffle=False):
examples = convert_cls_examples(examples, prompt_prefix, options)
if shuffle:
indexes = np.random.permutation(len(examples))
examples = [examples[i] for i in indexes]
return examples
def _save_examples(save_dir, file_name, examples):
count = 0
save_path = os.path.join(save_dir, file_name)
if not examples:
logger.info("Skip saving %d examples to %s." % (0, save_path))
return
with open(save_path, "w", encoding="utf-8") as f:
for example in examples:
f.write(json.dumps(example, ensure_ascii=False) + "\n")
count += 1
logger.info("Save %d examples to %s." % (count, save_path))
if len(args.splits) == 0:
if args.task_type == "ext":
examples = _create_ext_examples(raw_examples, args.negative_ratio,
args.is_shuffle)
else:
examples = _create_cls_examples(raw_examples, args.prompt_prefix,
args.options, args.is_shuffle)
_save_examples(args.save_dir, "train.txt", examples)
else:
if args.is_shuffle:
indexes = np.random.permutation(len(raw_examples))
raw_examples = [raw_examples[i] for i in indexes]
i1, i2, _ = args.splits
p1 = int(len(raw_examples) * i1)
p2 = int(len(raw_examples) * (i1 + i2))
if args.task_type == "ext":
train_examples = _create_ext_examples(
raw_examples[:p1], args.negative_ratio, args.is_shuffle)
dev_examples = _create_ext_examples(
raw_examples[p1:p2], -1, is_train=False)
test_examples = _create_ext_examples(
raw_examples[p2:], -1, is_train=False)
else:
train_examples = _create_cls_examples(
raw_examples[:p1], args.prompt_prefix, args.options)
dev_examples = _create_cls_examples(
raw_examples[p1:p2], args.prompt_prefix, args.options)
test_examples = _create_cls_examples(
raw_examples[p2:], args.prompt_prefix, args.options)
_save_examples(args.save_dir, "train.txt", train_examples)
_save_examples(args.save_dir, "dev.txt", dev_examples)
_save_examples(args.save_dir, "test.txt", test_examples)
logger.info('Finished! It takes %.2f seconds' % (time.time() - tic_time))
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--doccano_file", default="./data/doccano.json",
type=str, help="The doccano file exported from doccano platform.")
parser.add_argument("-s", "--save_dir", default="./data",
type=str, help="The path of data that you wanna save.")
parser.add_argument("--negative_ratio", default=5, type=int,
help="Used only for the extraction task, the ratio of positive and negative samples, number of negtive samples = negative_ratio * number of positive samples")
parser.add_argument("--splits", default=[0.8, 0.1, 0.1], type=float, nargs="*",
help="The ratio of samples in datasets. [0.6, 0.2, 0.2] means 60%% samples used for training, 20%% for evaluation and 20%% for test.")
parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str,
help="Select task type, ext for the extraction task and cls for the classification task, defaults to ext.")
parser.add_argument("--options", default=["正向", "负向"], type=str, nargs="+",
help="Used only for the classification task, the options for classification")
parser.add_argument("--prompt_prefix", default="情感倾向", type=str,
help="Used only for the classification task, the prompt prefix for classification")
parser.add_argument("--is_shuffle", default=True, type=bool,
help="Whether to shuffle the labeled dataset, defaults to True.")
parser.add_argument("--seed", type=int, default=1000,
help="random seed for initialization")
args = parser.parse_args()
do_convert()
import torch
from torch.utils.data import DataLoader
from model import uie_model, tokenizer
from bert4torch.snippets import seed_everything, sequence_padding, Callback
from torch import nn
from torch.utils.data import Dataset
import numpy as np
import json
from utils import get_bool_ids_greater_than, get_span
from random import sample
batch_size = 16
learning_rate = 1e-5
train_path = 'E:/Github/bert4torch/examples/sequence_labeling/uie/data/final_data/train.txt'
dev_path = 'E:/Github/bert4torch/examples/sequence_labeling/uie/data/final_data/dev.txt'
save_dir = './'
max_seq_len = 256
num_epochs = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
uie_model.to(device)
class IEDataset(Dataset):
"""信息抽取
"""
def __init__(self, file_path, tokenizer, max_seq_len, fewshot=None) -> None:
super().__init__()
self.file_path = file_path
if fewshot is None:
self.dataset = list(self.reader(file_path))
else:
assert isinstance(fewshot, int)
self.dataset = sample(list(self.reader(file_path)), fewshot)
self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
def __len__(self):
return len(self.dataset)
def __getitem__(self, index):
return self.dataset[index]
@staticmethod
def reader(data_path, max_seq_len=512):
"""read json
"""
with open(data_path, 'r', encoding='utf-8') as f:
for line in f:
json_line = json.loads(line)
content = json_line['content']
prompt = json_line['prompt']
# The model input looks like: [CLS] Prompt [SEP] Content [SEP]
# It includes three special (summary) tokens.
if max_seq_len <= len(prompt) + 3:
raise ValueError("The value of max_seq_len is too small, please set a larger value")
max_content_len = max_seq_len - len(prompt) - 3
if len(content) <= max_content_len:
yield json_line
else:
result_list = json_line['result_list']
json_lines = []
accumulate = 0
while True:
cur_result_list = []
for result in result_list:
if result['start'] + 1 <= max_content_len < result['end']:
max_content_len = result['start']
break
cur_content = content[:max_content_len]
res_content = content[max_content_len:]
while True:
if len(result_list) == 0:
break
elif result_list[0]['end'] <= max_content_len:
if result_list[0]['end'] > 0:
cur_result = result_list.pop(0)
cur_result_list.append(cur_result)
else:
cur_result_list = [result for result in result_list]
break
else:
break
json_line = {'content': cur_content, 'result_list': cur_result_list, 'prompt': prompt}
json_lines.append(json_line)
for result in result_list:
if result['end'] <= 0:
break
result['start'] -= max_content_len
result['end'] -= max_content_len
accumulate += max_content_len
max_content_len = max_seq_len - len(prompt) - 3
if len(res_content) == 0:
break
elif len(res_content) < max_content_len:
json_line = {'content': res_content, 'result_list': result_list, 'prompt': prompt}
json_lines.append(json_line)
break
else:
content = res_content
for json_line in json_lines:
yield json_line
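# Illustrative sketch (hypothetical helper, never called): one line of the converted training
# file that IEDataset.reader expects, i.e. a JSON object with 'content', 'prompt' and a
# 'result_list' of character-level (start, end) spans. The text and offsets are made up.
def _demo_reader_line():
    return json.dumps({'content': '张三来了', 'prompt': '人名',
                       'result_list': [{'text': '张三', 'start': 0, 'end': 2}]},
                      ensure_ascii=False)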
def collate_fn(batch):
"""example: {title, prompt, content, result_list}
"""
batch_token_ids, batch_token_type_ids, batch_start_ids, batch_end_ids = [], [], [], []
for example in batch:
token_ids, token_type_ids, offset_mapping = tokenizer.encode(example["prompt"], example["content"],
maxlen=max_seq_len, return_offsets='transformers')
bias = 0
for index in range(len(offset_mapping)):
if index == 0:
continue
mapping = offset_mapping[index]
if mapping[0] == 0 and mapping[1] == 0 and bias == 0:
bias = index
if mapping[0] == 0 and mapping[1] == 0:
continue
offset_mapping[index][0] += bias
offset_mapping[index][1] += bias
start_ids = [0 for _ in range(len(token_ids))]
end_ids = [0 for _ in range(len(token_ids))]
for item in example["result_list"]:
start = map_offset(item["start"] + bias, offset_mapping)
end = map_offset(item["end"] - 1 + bias, offset_mapping)
start_ids[start] = 1.0
end_ids[end] = 1.0
batch_token_ids.append(token_ids)
batch_token_type_ids.append(token_type_ids)
batch_start_ids.append(start_ids)
batch_end_ids.append(end_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_token_type_ids = torch.tensor(sequence_padding(batch_token_type_ids), dtype=torch.long, device=device)
batch_start_ids = torch.tensor(sequence_padding(batch_start_ids), dtype=torch.float, device=device)
batch_end_ids = torch.tensor(sequence_padding(batch_end_ids), dtype=torch.float, device=device)
return [batch_token_ids, batch_token_type_ids], [batch_start_ids, batch_end_ids]
def map_offset(ori_offset, offset_mapping):
"""map ori offset to token offset
"""
for index, span in enumerate(offset_mapping):
if span[0] <= ori_offset < span[1]:
return index
return -1
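# Illustrative check (hypothetical helper, never called): with token offsets
# [(0, 0), (0, 2), (2, 4)] a character offset of 3 falls inside the third token,
# so map_offset returns index 2; offsets that hit no token return -1.
def _demo_map_offset():
    return map_offset(3, [(0, 0), (0, 2), (2, 4)])  # -> 2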
# prepare the data
train_ds = IEDataset(train_path, tokenizer=tokenizer, max_seq_len=max_seq_len, fewshot=None)
dev_ds = IEDataset(dev_path, tokenizer=tokenizer, max_seq_len=max_seq_len)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(dev_ds, batch_size=batch_size, collate_fn=collate_fn)
class MyLoss(nn.Module):
def forward(self, y_pred, y_true):
start_prob, end_prob = y_pred
start_ids, end_ids = y_true
loss_start = torch.nn.functional.binary_cross_entropy(start_prob, start_ids)
loss_end = torch.nn.functional.binary_cross_entropy(end_prob, end_ids)
return loss_start + loss_end
uie_model.compile(
loss=MyLoss(),
optimizer=torch.optim.AdamW(lr=learning_rate, params=uie_model.parameters()),
)
class SpanEvaluator(Callback):
"""SpanEvaluator computes the precision, recall and F1-score for span detection.
"""
def __init__(self):
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
self.best_val_f1 = 0
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = self.evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val-entity level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
def evaluate(self, dataloader):
self.reset()
for x_true, y_true in dataloader:
start_prob, end_prob = uie_model.predict(*x_true)
start_ids, end_ids = y_true
num_correct, num_infer, num_label = self.compute(start_prob, end_prob, start_ids, end_ids)
self.update(num_correct, num_infer, num_label)
precision, recall, f1 = self.accumulate()
return f1, precision, recall
def compute(self, start_probs, end_probs, gold_start_ids, gold_end_ids):
"""Computes the precision, recall and F1-score for span detection.
"""
start_probs = start_probs.cpu().numpy()
end_probs = end_probs.cpu().numpy()
gold_start_ids = gold_start_ids.cpu().numpy()
gold_end_ids = gold_end_ids.cpu().numpy()
pred_start_ids = get_bool_ids_greater_than(start_probs)
pred_end_ids = get_bool_ids_greater_than(end_probs)
gold_start_ids = get_bool_ids_greater_than(gold_start_ids.tolist())
gold_end_ids = get_bool_ids_greater_than(gold_end_ids.tolist())
num_correct_spans = 0
num_infer_spans = 0
num_label_spans = 0
for predict_start_ids, predict_end_ids, label_start_ids, label_end_ids in zip(
pred_start_ids, pred_end_ids, gold_start_ids, gold_end_ids):
[_correct, _infer, _label] = self.eval_span(predict_start_ids, predict_end_ids, label_start_ids, label_end_ids)
num_correct_spans += _correct
num_infer_spans += _infer
num_label_spans += _label
return num_correct_spans, num_infer_spans, num_label_spans
def update(self, num_correct_spans, num_infer_spans, num_label_spans):
"""
This function takes (num_correct_spans, num_infer_spans, num_label_spans) as input,
to accumulate and update the corresponding status of the SpanEvaluator object.
"""
self.num_infer_spans += num_infer_spans
self.num_label_spans += num_label_spans
self.num_correct_spans += num_correct_spans
def eval_span(self, predict_start_ids, predict_end_ids, label_start_ids,
label_end_ids):
"""
evaluate position extraction (start, end)
return num_correct, num_infer, num_label
input: [1, 2, 10] [4, 12] [2, 10] [4, 11]
output: (1, 2, 2)
"""
pred_set = get_span(predict_start_ids, predict_end_ids)
label_set = get_span(label_start_ids, label_end_ids)
num_correct = len(pred_set & label_set)
num_infer = len(pred_set)
num_label = len(label_set)
return (num_correct, num_infer, num_label)
def accumulate(self):
"""
This function returns the mean precision, recall and f1 score for all accumulated minibatches.
Returns:
tuple: Returns tuple (`precision, recall, f1 score`).
"""
precision = float(self.num_correct_spans / self.num_infer_spans) if self.num_infer_spans else 0.
recall = float(self.num_correct_spans / self.num_label_spans) if self.num_label_spans else 0.
f1_score = float(2 * precision * recall / (precision + recall)) if self.num_correct_spans else 0.
return precision, recall, f1_score
def reset(self):
"""
Reset function empties the evaluation memory for previous mini-batches.
"""
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
if __name__ == "__main__":
evaluator = SpanEvaluator()
print('zero_shot performance: ', evaluator.evaluate(valid_dataloader))
uie_model.fit(train_dataloader, epochs=num_epochs, steps_per_epoch=None, callbacks=[evaluator])
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.losses import FocalLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel, BERT
from tqdm import tqdm
config_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class UIE(BERT):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
hidden_size = self.hidden_size
self.linear_start = nn.Linear(hidden_size, 1)
self.linear_end = nn.Linear(hidden_size, 1)
self.sigmoid = nn.Sigmoid()
if kwargs.get('use_task_id'):
# Add task type embedding to BERT
task_type_embeddings = nn.Embedding(kwargs.get('task_type_vocab_size'), self.hidden_size)
self.embeddings.task_type_embeddings = task_type_embeddings
def hook(module, input, output):
return output+task_type_embeddings(torch.zeros(input[0].size(), dtype=torch.int64, device=input[0].device))
self.embeddings.word_embeddings.register_forward_hook(hook)
def forward(self, token_ids, token_type_ids):
outputs = super().forward([token_ids, token_type_ids])
sequence_output = outputs[0]
start_logits = self.linear_start(sequence_output)
start_logits = torch.squeeze(start_logits, -1)
start_prob = self.sigmoid(start_logits)
end_logits = self.linear_end(sequence_output)
end_logits = torch.squeeze(end_logits, -1)
end_prob = self.sigmoid(end_logits)
return start_prob, end_prob
@torch.no_grad()
def predict(self, token_ids, token_type_ids):
self.eval()
start_prob, end_prob = self.forward(token_ids, token_type_ids)
return start_prob, end_prob
uie_model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=UIE, with_pool=True)
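# Illustrative usage sketch (hypothetical helper, never executed here; assumes the checkpoint
# paths above exist and that tokenizer.encode returns token_ids and segment_ids): encode one
# prompt/content pair and get per-token start/end probabilities of shape [1, seq_len].
def _demo_uie_predict():
    token_ids, segment_ids = tokenizer.encode('人名', '张三来了')
    return uie_model.predict(torch.tensor([token_ids]), torch.tensor([segment_ids]))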
import numpy as np
import math
import torch
from bert4torch.snippets import sequence_padding
from utils import get_bool_ids_greater_than, get_span, get_id_and_prob, cut_chinese_sent, dbc2sbc
from pprint import pprint
import torch.nn.functional as F
class UIEPredictor(object):
def __init__(self, schema, device='cpu', position_prob=0.5, max_seq_len=512, batch_size=64, split_sentence=False):
self._device = device
self._position_prob = position_prob
self._max_seq_len = max_seq_len
self._batch_size = batch_size
self._split_sentence = split_sentence
self._schema_tree = None
self.set_schema(schema)
from model import uie_model, tokenizer
self._tokenizer = tokenizer
self.model = uie_model.to(self._device)
def set_schema(self, schema):
if isinstance(schema, dict) or isinstance(schema, str):
schema = [schema]
self._schema_tree = self._build_tree(schema)
def __call__(self, inputs):
texts = inputs
texts = [texts] if isinstance(texts, str) else texts
results = self._multi_stage_predict(texts)
return results
def _multi_stage_predict(self, datas):
"""构建schema tree和预测
"""
results = [{} for _ in range(len(datas))]
# input check for early return
if len(datas) < 1 or self._schema_tree is None:
return results
# copy so that `self._schema_tree` stays unchanged
schema_list = self._schema_tree.children[:]
while len(schema_list) > 0:
node = schema_list.pop(0)
examples = []
input_map = {}
cnt = 0
idx = 0
if not node.prefix:
for data in datas:
examples.append({"text": data, "prompt": dbc2sbc(node.name)})
input_map[cnt] = [idx]
idx += 1
cnt += 1
else:
for pre, data in zip(node.prefix, datas):
if len(pre) == 0:
input_map[cnt] = []
else:
for p in pre:
examples.append({ "text": data, "prompt": dbc2sbc(p + node.name)})
input_map[cnt] = [i + idx for i in range(len(pre))]
idx += len(pre)
cnt += 1
if len(examples) == 0:
result_list = []
else:
result_list = self._single_stage_predict(examples)
if not node.parent_relations:
relations = [[] for i in range(len(datas))]
for k, v in input_map.items():
for idx in v:
if len(result_list[idx]) == 0:
continue
if node.name not in results[k].keys():
results[k][node.name] = result_list[idx]
else:
results[k][node.name].extend(result_list[idx])
if node.name in results[k].keys():
relations[k].extend(results[k][node.name])
else:
relations = node.parent_relations
for k, v in input_map.items():
for i in range(len(v)):
if len(result_list[v[i]]) == 0:
continue
if "relations" not in relations[k][i].keys():
relations[k][i]["relations"] = {
node.name: result_list[v[i]]
}
elif node.name not in relations[k][i]["relations"].keys(
):
relations[k][i]["relations"][
node.name] = result_list[v[i]]
else:
relations[k][i]["relations"][node.name].extend(
result_list[v[i]])
new_relations = [[] for i in range(len(datas))]
for i in range(len(relations)):
for j in range(len(relations[i])):
if "relations" in relations[i][j].keys(
) and node.name in relations[i][j]["relations"].keys():
for k in range(
len(relations[i][j]["relations"][
node.name])):
new_relations[i].append(relations[i][j][
"relations"][node.name][k])
relations = new_relations
prefix = [[] for _ in range(len(datas))]
for k, v in input_map.items():
for idx in v:
for i in range(len(result_list[idx])):
prefix[k].append(result_list[idx][i]["text"] + "的")
for child in node.children:
child.prefix = prefix
child.parent_relations = relations
schema_list.append(child)
return results
def _convert_ids_to_results(self, examples, sentence_ids, probs):
"""
Convert ids to raw text in a single stage.
"""
results = []
for example, sentence_id, prob in zip(examples, sentence_ids, probs):
if len(sentence_id) == 0:
results.append([])
continue
result_list = []
text = example["text"]
prompt = example["prompt"]
for i in range(len(sentence_id)):
start, end = sentence_id[i]
if start < 0 and end >= 0:
continue
if end < 0:
start += (len(prompt) + 1)
end += (len(prompt) + 1)
result = {"text": prompt[start:end],
"probability": prob[i]}
result_list.append(result)
else:
result = {
"text": text[start:end],
"start": start,
"end": end,
"probability": prob[i]
}
result_list.append(result)
results.append(result_list)
return results
def _auto_splitter(self, input_texts, max_text_len, split_sentence=False):
'''
Split the raw texts automatically for model inference.
Args:
input_texts (List[str]): input raw texts.
max_text_len (int): cutting length.
split_sentence (bool): If True, sentence-level split will be performed.
return:
short_input_texts (List[str]): the short input texts for model inference.
input_mapping (dict): mapping between raw text and short input texts.
'''
input_mapping = {}
short_input_texts = []
cnt_org = 0
cnt_short = 0
for text in input_texts:
if not split_sentence:
sens = [text]
else:
sens = cut_chinese_sent(text)
for sen in sens:
lens = len(sen)
if lens <= max_text_len:
short_input_texts.append(sen)
if cnt_org not in input_mapping.keys():
input_mapping[cnt_org] = [cnt_short]
else:
input_mapping[cnt_org].append(cnt_short)
cnt_short += 1
else:
temp_text_list = [sen[i:i + max_text_len] for i in range(0, lens, max_text_len)]
short_input_texts.extend(temp_text_list)
short_idx = cnt_short
cnt_short += math.ceil(lens / max_text_len)
temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)]
if cnt_org not in input_mapping.keys():
input_mapping[cnt_org] = temp_text_id
else:
input_mapping[cnt_org].extend(temp_text_id)
cnt_org += 1
return short_input_texts, input_mapping
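    # Worked illustration (comments only; values are hypothetical, not from the
    # original file): with max_text_len=5 and input_texts=["abcdefg", "hi"], the
    # first text is cut into "abcde" and "fg" while the second stays whole, so
    #   short_input_texts == ["abcde", "fg", "hi"]
    #   input_mapping     == {0: [0, 1], 1: [2]}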
def _single_stage_predict(self, inputs):
input_texts = []
prompts = []
for i in range(len(inputs)):
input_texts.append(inputs[i]["text"])
prompts.append(inputs[i]["prompt"])
# max predict length should exclude the length of prompt and summary tokens
        max_predict_len = self._max_seq_len - max(len(p) for p in prompts) - 3
short_input_texts, self.input_mapping = self._auto_splitter(input_texts, max_predict_len, split_sentence=self._split_sentence)
short_texts_prompts = []
for k, v in self.input_mapping.items():
short_texts_prompts.extend([prompts[k] for i in range(len(v))])
short_inputs = [{"text": short_input_texts[i], "prompt": short_texts_prompts[i]} for i in range(len(short_input_texts))]
token_ids, segment_ids, offset_maps = self._tokenizer.encode(short_texts_prompts, short_input_texts, maxlen=self._max_seq_len, return_offsets='transformers')
start_prob_concat, end_prob_concat = [], []
for batch_start in range(0, len(short_input_texts), self._batch_size):
batch_token_ids = token_ids[batch_start:batch_start+self._batch_size]
batch_segment_ids = segment_ids[batch_start:batch_start+self._batch_size]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=self._device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=self._device)
start_prob, end_prob = self.model.predict(batch_token_ids, batch_segment_ids)
start_prob_concat.append(start_prob.cpu().numpy())
end_prob_concat.append(end_prob.cpu().numpy())
start_prob_concat = np.concatenate(start_prob_concat)
end_prob_concat = np.concatenate(end_prob_concat)
start_ids_list = get_bool_ids_greater_than(start_prob_concat, limit=self._position_prob, return_prob=True)
end_ids_list = get_bool_ids_greater_than(end_prob_concat, limit=self._position_prob, return_prob=True)
sentence_ids = []
probs = []
for start_ids, end_ids, ids, offset_map in zip(start_ids_list, end_ids_list, token_ids, offset_maps):
for i in reversed(range(len(ids))):
if ids[i] != 0:
ids = ids[:i]
break
span_list = get_span(start_ids, end_ids, with_prob=True)
sentence_id, prob = get_id_and_prob(span_list, offset_map)
sentence_ids.append(sentence_id)
probs.append(prob)
results = self._convert_ids_to_results(short_inputs, sentence_ids, probs)
results = self._auto_joiner(results, short_input_texts, self.input_mapping)
return results
def _auto_joiner(self, short_results, short_inputs, input_mapping):
concat_results = []
is_cls_task = False
for short_result in short_results:
if short_result == []:
continue
            elif 'start' not in short_result[0] and 'end' not in short_result[0]:
is_cls_task = True
break
else:
break
for k, vs in input_mapping.items():
if is_cls_task:
cls_options = {}
single_results = []
for v in vs:
if len(short_results[v]) == 0:
continue
                    if short_results[v][0]['text'] not in cls_options:
                        cls_options[short_results[v][0]['text']] = [1, short_results[v][0]['probability']]
                    else:
                        cls_options[short_results[v][0]['text']][0] += 1
                        cls_options[short_results[v][0]['text']][1] += short_results[v][0]['probability']
if len(cls_options) != 0:
cls_res, cls_info = max(cls_options.items(),
key=lambda x: x[1])
concat_results.append([{
'text': cls_res,
'probability': cls_info[1] / cls_info[0]
}])
else:
concat_results.append([])
else:
offset = 0
single_results = []
for v in vs:
if v == 0:
single_results = short_results[v]
offset += len(short_inputs[v])
else:
for i in range(len(short_results[v])):
                            if 'start' not in short_results[v][i] or 'end' not in short_results[v][i]:
continue
short_results[v][i]['start'] += offset
short_results[v][i]['end'] += offset
offset += len(short_inputs[v])
single_results.extend(short_results[v])
concat_results.append(single_results)
return concat_results
def predict(self, input_data):
results = self._multi_stage_predict(input_data)
return results
@classmethod
def _build_tree(cls, schema, name='root'):
"""
Build the schema tree.
"""
schema_tree = SchemaTree(name)
for s in schema:
if isinstance(s, str):
schema_tree.add_child(SchemaTree(s))
elif isinstance(s, dict):
for k, v in s.items():
if isinstance(v, str):
child = [v]
elif isinstance(v, list):
child = v
else:
raise TypeError("Invalid schema, value for each key:value pairs should be list or string but {} received".format(type(v)))
schema_tree.add_child(cls._build_tree(child, name=k))
else:
raise TypeError("Invalid schema, element should be string or dict, but {} received".format(type(s)))
return schema_tree
class SchemaTree(object):
"""SchemaTree的实现
"""
def __init__(self, name='root', children=None):
self.name = name
self.children = []
self.prefix = None
self.parent_relations = None
if children is not None:
for child in children:
self.add_child(child)
def __repr__(self):
return self.name
def add_child(self, node):
        assert isinstance(node, SchemaTree), "The children of a node should be an instance of SchemaTree."
self.children.append(node)
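# Hedged sketch (hypothetical helper, not part of the original file): how a
# nested schema such as {'竞赛名称': ['主办方', '承办方']} is expanded into a
# SchemaTree via UIEPredictor._build_tree defined above (it expects a list).
def _demo_schema_tree():
    tree = UIEPredictor._build_tree([{'竞赛名称': ['主办方', '承办方']}])
    parent = tree.children[0]                          # SchemaTree('竞赛名称')
    return [child.name for child in parent.children]   # -> ['主办方', '承办方']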
if __name__ == '__main__':
    # Named entity recognition
schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction
ie = UIEPredictor(schema=schema)
pprint(ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!"))
schema = ['肿瘤的大小', '肿瘤的个数', '肝癌级别', '脉管内癌栓分级']
ie.set_schema(schema)
pprint(ie("(右肝肿瘤)肝细胞性肝癌(II-III级,梁索型和假腺管型),肿瘤包膜不完整,紧邻肝被膜,侵及周围肝组织,未见脉管内癌栓(MVI分级:M0级)及卫星子灶形成。(肿物1个,大小4.2×4.0×2.8cm)。"))
    # Relation extraction
schema = {'竞赛名称': ['主办方', '承办方', '已举办次数']}
ie.set_schema(schema) # Reset schema
pprint(ie('2022语言与智能技术竞赛由中国中文信息学会和中国计算机学会联合主办,百度公司、中国中文信息学会评测工作委员会和中国计算机学会自然语言处理专委会承办,已连续举办4届,成为全球最热门的中文NLP赛事之一。'))
    # Event extraction
schema = {'地震触发词': ['地震强度', '时间', '震中位置', '震源深度']}
ie.set_schema(schema) # Reset schema
ie('中国地震台网正式测定:5月16日06时08分在云南临沧市凤庆县(北纬24.34度,东经99.98度)发生3.5级地震,震源深度10千米。')
    # Opinion extraction from reviews
schema = {'评价维度': ['观点词', '情感倾向[正向,负向]']}
ie.set_schema(schema) # Reset schema
pprint(ie("店面干净,很清静,服务员服务热情,性价比很高,发现收银台有排队"))
    # Sentiment classification
schema = '情感倾向[正向,负向]'
ie.set_schema(schema)
ie('这个产品用起来真的很流畅,我非常喜欢')
import contextlib
import functools
import json
import logging
import math
import random
import re
import shutil
import threading
import time
from functools import partial
import colorlog
import numpy as np
import torch
from colorama import Back, Fore
from tqdm import tqdm
loggers = {}
log_config = {
'DEBUG': {'level': 10, 'color': 'purple'},
'INFO': {'level': 20, 'color': 'green'},
'TRAIN': {'level': 21, 'color': 'cyan'},
'EVAL': {'level': 22, 'color': 'blue'},
'WARNING': {'level': 30, 'color': 'yellow'},
'ERROR': {'level': 40, 'color': 'red'},
'CRITICAL': {'level': 50, 'color': 'bold_red'}
}
def get_span(start_ids, end_ids, with_prob=False):
"""
Get span set from position start and end list.
Args:
start_ids (List[int]/List[tuple]): The start index list.
end_ids (List[int]/List[tuple]): The end index list.
        with_prob (bool): If True, each element of start_ids and end_ids is a tuple like (index, probability).
    Returns:
        set: The span set without overlap; every id can be used only once.
"""
if with_prob:
start_ids = sorted(start_ids, key=lambda x: x[0])
end_ids = sorted(end_ids, key=lambda x: x[0])
else:
start_ids = sorted(start_ids)
end_ids = sorted(end_ids)
start_pointer = 0
end_pointer = 0
len_start = len(start_ids)
len_end = len(end_ids)
couple_dict = {}
while start_pointer < len_start and end_pointer < len_end:
if with_prob:
start_id = start_ids[start_pointer][0]
end_id = end_ids[end_pointer][0]
else:
start_id = start_ids[start_pointer]
end_id = end_ids[end_pointer]
if start_id == end_id:
couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
start_pointer += 1
end_pointer += 1
continue
if start_id < end_id:
couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
start_pointer += 1
continue
if start_id > end_id:
end_pointer += 1
continue
result = [(couple_dict[end], end) for end in couple_dict]
result = set(result)
return result
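# Minimal illustration (hypothetical helper, not part of the original file) of
# get_span: start and end indices are paired positionally into non-overlapping spans.
def _demo_get_span():
    return get_span([1, 5], [3, 7])  # -> {(1, 3), (5, 7)}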
def get_bool_ids_greater_than(probs, limit=0.5, return_prob=False):
"""
    Get the indices along the last dimension of the probability arrays whose values are greater than the limit.
Args:
probs (List[List[float]]): The input probability arrays.
limit (float): The limitation for probability.
        return_prob (bool): Whether to return the probability along with each index.
    Returns:
        List[List[int]]: The indices along the last dimension that meet the condition.
"""
probs = np.array(probs)
dim_len = len(probs.shape)
if dim_len > 1:
result = []
for p in probs:
result.append(get_bool_ids_greater_than(p, limit, return_prob))
return result
else:
result = []
for i, p in enumerate(probs):
if p > limit:
if return_prob:
result.append((i, p))
else:
result.append(i)
return result
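# Minimal illustration (hypothetical helper, not part of the original file) of
# get_bool_ids_greater_than applied row by row to a 2D probability array.
def _demo_bool_ids():
    probs = [[0.1, 0.9, 0.6], [0.2, 0.3, 0.8]]
    return get_bool_ids_greater_than(probs, limit=0.5)  # -> [[1, 2], [2]]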
class Logger(object):
'''
    Default logger used in UIE.
Args:
name(str) : Logger name, default is 'UIE'
'''
def __init__(self, name: str = None):
name = 'UIE' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(
self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(
self.__call__, conf['level'])
self.format = colorlog.ColoredFormatter(
'%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={key: conf['color']
for key, conf in log_config.items()})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float = 0.1):
'''
        Continuously print a spinner-style progress indicator while the wrapped block runs.
        Args:
            msg(str): Message to be printed.
            interval(float): Refresh interval in seconds. Defaults to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
t = threading.Thread(target=_printer)
t.start()
yield
end = True
logger = Logger()
BAR_FORMAT = f'{{desc}}: {Fore.GREEN}{{percentage:3.0f}}%{Fore.RESET} {Fore.BLUE}{{bar}}{Fore.RESET} {Fore.GREEN}{{n_fmt}}/{{total_fmt}} {Fore.RED}{{rate_fmt}}{{postfix}}{Fore.RESET} eta {Fore.CYAN}{{remaining}}{Fore.RESET}'
BAR_FORMAT_NO_TIME = f'{{desc}}: {Fore.GREEN}{{percentage:3.0f}}%{Fore.RESET} {Fore.BLUE}{{bar}}{Fore.RESET} {Fore.GREEN}{{n_fmt}}/{{total_fmt}}{Fore.RESET}'
BAR_TYPE = [
"░▝▗▖▘▚▞▛▙█",
"░▖▘▝▗▚▞█",
" ▖▘▝▗▚▞█",
"░▒█",
" >=",
" ▏▎▍▌▋▊▉█"
"░▏▎▍▌▋▊▉█"
]
tqdm = partial(tqdm, bar_format=BAR_FORMAT, ascii=BAR_TYPE[0], leave=False)
def get_id_and_prob(spans, offset_map):
prompt_length = 0
for i in range(1, len(offset_map)):
if offset_map[i] != [0, 0]:
prompt_length += 1
else:
break
for i in range(1, prompt_length + 1):
offset_map[i][0] -= (prompt_length + 1)
offset_map[i][1] -= (prompt_length + 1)
sentence_id = []
prob = []
for start, end in spans:
prob.append(start[1] * end[1])
sentence_id.append(
(offset_map[start[0]][0], offset_map[end[0]][1]))
return sentence_id, prob
def cut_chinese_sent(para):
"""
Cut the Chinese sentences more precisely, reference to
"https://blog.csdn.net/blmoistawinde/article/details/82379256".
"""
para = re.sub(r'([。!?\?])([^”’])', r'\1\n\2', para)
para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)
para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
para = para.rstrip()
return para.split("\n")
def dbc2sbc(s):
rs = ""
for char in s:
code = ord(char)
if code == 0x3000:
code = 0x0020
else:
code -= 0xfee0
if not (0x0021 <= code and code <= 0x7e):
rs += char
continue
rs += chr(code)
return rs
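# Minimal illustration (hypothetical helper, not part of the original file):
# dbc2sbc maps full-width ASCII (U+FF01-U+FF5E) and the ideographic space to
# their half-width equivalents; everything else is left untouched.
def _demo_dbc2sbc():
    return dbc2sbc("ABC 123!")  # -> "ABC 123!"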
def convert_cls_examples(raw_examples, prompt_prefix, options):
examples = []
logger.info(f"Converting doccano data...")
with tqdm(total=len(raw_examples)) as pbar:
for line in raw_examples:
items = json.loads(line)
# Compatible with doccano >= 1.6.2
if "data" in items.keys():
text, labels = items["data"], items["label"]
else:
text, labels = items["text"], items["label"]
random.shuffle(options)
prompt = ""
sep = ","
for option in options:
prompt += option
prompt += sep
prompt = prompt_prefix + "[" + prompt.rstrip(sep) + "]"
result_list = []
example = {
"content": text,
"result_list": result_list,
"prompt": prompt
}
for label in labels:
start = prompt.rfind(label[0]) - len(prompt) - 1
end = start + len(label)
result = {"text": label, "start": start, "end": end}
example["result_list"].append(result)
examples.append(example)
return examples
def add_negative_example(examples, texts, prompts, label_set, negative_ratio):
negative_examples = []
positive_examples = []
with tqdm(total=len(prompts)) as pbar:
for i, prompt in enumerate(prompts):
negative_sample = []
redundants_list = list(set(label_set) ^ set(prompt))
redundants_list.sort()
num_positive = len(examples[i])
if num_positive != 0:
actual_ratio = math.ceil(len(redundants_list) / num_positive)
else:
# Set num_positive to 1 for text without positive example
num_positive, actual_ratio = 1, 0
if actual_ratio <= negative_ratio or negative_ratio == -1:
idxs = [k for k in range(len(redundants_list))]
else:
idxs = random.sample(
range(0, len(redundants_list)),
negative_ratio * num_positive)
for idx in idxs:
negative_result = {
"content": texts[i],
"result_list": [],
"prompt": redundants_list[idx]
}
negative_examples.append(negative_result)
positive_examples.extend(examples[i])
pbar.update(1)
return positive_examples, negative_examples
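# Hedged illustration (hypothetical helper, not part of the original file): the
# candidate negative prompts for one text are the labels in label_set that do
# not appear among that text's positive prompts, i.e. the symmetric difference
# used in add_negative_example above.
def _demo_negative_prompts():
    label_set = ["时间", "选手", "赛事名称"]
    positive_prompts = ["选手"]
    return sorted(set(label_set) ^ set(positive_prompts))  # -> ['时间', '赛事名称']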
def add_full_negative_example(examples, texts, relation_prompts, predicate_set,
subject_goldens):
with tqdm(total=len(relation_prompts)) as pbar:
for i, relation_prompt in enumerate(relation_prompts):
negative_sample = []
for subject in subject_goldens[i]:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = subject + "的" + predicate
if prompt not in relation_prompt:
negative_result = {
"content": texts[i],
"result_list": [],
"prompt": prompt
}
negative_sample.append(negative_result)
examples[i].extend(negative_sample)
pbar.update(1)
return examples
def construct_relation_prompt_set(entity_name_set, predicate_set):
relation_prompt_set = set()
for entity_name in entity_name_set:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
relation_prompt = entity_name + "的" + predicate
relation_prompt_set.add(relation_prompt)
return sorted(list(relation_prompt_set))
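# Minimal illustration (hypothetical helper, not part of the original file):
# every entity name is crossed with every predicate to form candidate relation prompts.
def _demo_relation_prompts():
    return construct_relation_prompt_set(["2022语言与智能技术竞赛"], ["主办方", "承办方"])
    # -> ['2022语言与智能技术竞赛的主办方', '2022语言与智能技术竞赛的承办方']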
def convert_ext_examples(raw_examples, negative_ratio, is_train=True):
texts = []
entity_examples = []
relation_examples = []
entity_prompts = []
relation_prompts = []
entity_label_set = []
entity_name_set = []
predicate_set = []
subject_goldens = []
logger.info(f"Converting doccano data...")
with tqdm(total=len(raw_examples)) as pbar:
for line in raw_examples:
items = json.loads(line)
entity_id = 0
if "data" in items.keys():
relation_mode = False
if isinstance(items["label"],
dict) and "entities" in items["label"].keys():
relation_mode = True
text = items["data"]
entities = []
if not relation_mode:
# Export file in JSONL format which doccano < 1.7.0
for item in items["label"]:
entity = {
"id": entity_id,
"start_offset": item[0],
"end_offset": item[1],
"label": item[2]
}
entities.append(entity)
entity_id += 1
else:
# Export file in JSONL format for relation labeling task which doccano < 1.7.0
for item in items["label"]["entities"]:
entity = {
"id": entity_id,
"start_offset": item["start_offset"],
"end_offset": item["end_offset"],
"label": item["label"]
}
entities.append(entity)
entity_id += 1
relations = []
else:
# Export file in JSONL format which doccano >= 1.7.0
if "label" in items.keys():
text = items["text"]
entities = []
for item in items["label"]:
entity = {
"id": entity_id,
"start_offset": item[0],
"end_offset": item[1],
"label": item[2]
}
entities.append(entity)
entity_id += 1
relations = []
else:
# Export file in JSONL (relation) format
text, relations, entities = items["text"], items[
"relations"], items["entities"]
texts.append(text)
entity_example = []
entity_prompt = []
entity_example_map = {}
entity_map = {} # id to entity name
for entity in entities:
entity_name = text[entity["start_offset"]:entity["end_offset"]]
entity_map[entity["id"]] = {
"name": entity_name,
"start": entity["start_offset"],
"end": entity["end_offset"]
}
entity_label = entity["label"]
result = {
"text": entity_name,
"start": entity["start_offset"],
"end": entity["end_offset"]
}
if entity_label not in entity_example_map.keys():
entity_example_map[entity_label] = {
"content": text,
"result_list": [result],
"prompt": entity_label
}
else:
entity_example_map[entity_label]["result_list"].append(
result)
if entity_label not in entity_label_set:
entity_label_set.append(entity_label)
if entity_name not in entity_name_set:
entity_name_set.append(entity_name)
entity_prompt.append(entity_label)
for v in entity_example_map.values():
entity_example.append(v)
entity_examples.append(entity_example)
entity_prompts.append(entity_prompt)
subject_golden = []
relation_example = []
relation_prompt = []
relation_example_map = {}
for relation in relations:
predicate = relation["type"]
subject_id = relation["from_id"]
object_id = relation["to_id"]
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = entity_map[subject_id]["name"] + "的" + predicate
if entity_map[subject_id]["name"] not in subject_golden:
subject_golden.append(entity_map[subject_id]["name"])
result = {
"text": entity_map[object_id]["name"],
"start": entity_map[object_id]["start"],
"end": entity_map[object_id]["end"]
}
if prompt not in relation_example_map.keys():
relation_example_map[prompt] = {
"content": text,
"result_list": [result],
"prompt": prompt
}
else:
relation_example_map[prompt]["result_list"].append(result)
if predicate not in predicate_set:
predicate_set.append(predicate)
relation_prompt.append(prompt)
for v in relation_example_map.values():
relation_example.append(v)
relation_examples.append(relation_example)
relation_prompts.append(relation_prompt)
subject_goldens.append(subject_golden)
pbar.update(1)
def concat_examples(positive_examples, negative_examples, negative_ratio):
examples = []
if math.ceil(len(negative_examples) /
len(positive_examples)) <= negative_ratio:
examples = positive_examples + negative_examples
else:
# Random sampling the negative examples to ensure overall negative ratio unchanged.
idxs = random.sample(
range(0, len(negative_examples)),
negative_ratio * len(positive_examples))
negative_examples_sampled = []
for idx in idxs:
negative_examples_sampled.append(negative_examples[idx])
examples = positive_examples + negative_examples_sampled
return examples
logger.info(f"Adding negative samples for first stage prompt...")
positive_examples, negative_examples = add_negative_example(
entity_examples, texts, entity_prompts, entity_label_set,
negative_ratio)
if len(positive_examples) == 0:
all_entity_examples = []
elif is_train:
all_entity_examples = concat_examples(positive_examples,
negative_examples, negative_ratio)
else:
all_entity_examples = positive_examples + negative_examples
all_relation_examples = []
if len(predicate_set) != 0:
if is_train:
logger.info(f"Adding negative samples for second stage prompt...")
relation_prompt_set = construct_relation_prompt_set(entity_name_set,
predicate_set)
positive_examples, negative_examples = add_negative_example(
relation_examples, texts, relation_prompts, relation_prompt_set,
negative_ratio)
all_relation_examples = concat_examples(
positive_examples, negative_examples, negative_ratio)
else:
logger.info(f"Adding negative samples for second stage prompt...")
relation_examples = add_full_negative_example(
relation_examples, texts, relation_prompts, predicate_set,
subject_goldens)
            all_relation_examples = [
                r for relation_example in relation_examples
                for r in relation_example
            ]
return all_entity_examples, all_relation_examples
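# Hedged sketch (not part of the original file): convert_ext_examples emits
# prompt-style training examples shaped like the dict below; "start"/"end" are
# character offsets into "content" (end exclusive), and the relation prompt is
# subject + "的" + predicate, matching the construction above.
def _demo_ext_example_format():
    return {
        "content": "2022语言与智能技术竞赛由中国中文信息学会主办",
        "result_list": [{"text": "中国中文信息学会", "start": 14, "end": 22}],
        "prompt": "2022语言与智能技术竞赛的主办方",
    }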
def get_path_from_url(url,
root_dir,
check_exist=True,
decompress=True):
""" Download from given url to root_dir.
if file or directory specified by url is exists under
root_dir, return the path directly, otherwise download
from url and decompress it, return the path.
Args:
url (str): download url
root_dir (str): root dir for downloading, it should be
WEIGHTS_HOME or DATASET_HOME
decompress (bool): decompress zip or tar file. Default is `True`
Returns:
str: a local path to save downloaded models & weights & datasets.
"""
import os.path
import os
import tarfile
import zipfile
def is_url(path):
"""
        Return whether the given path is a URL.
        Args:
            path (string): the path to check.
"""
return path.startswith('http://') or path.startswith('https://')
def _map_path(url, root_dir):
# parse path after download under root_dir
fname = os.path.split(url)[-1]
fpath = fname
return os.path.join(root_dir, fpath)
def _get_download(url, fullname):
import requests
# using requests.get method
fname = os.path.basename(fullname)
try:
req = requests.get(url, stream=True)
except Exception as e: # requests.exceptions.ConnectionError
logger.info("Downloading {} from {} failed with exception {}".format(
fname, url, str(e)))
return False
if req.status_code != 200:
raise RuntimeError("Downloading from {} failed with code "
"{}!".format(url, req.status_code))
        # To guard against interrupted downloads, write to tmp_fullname
        # first and move it to fullname once the download has finished.
tmp_fullname = fullname + "_tmp"
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
with tqdm(total=(int(total_size) + 1023) // 1024, unit='KB') as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
pbar.update(1)
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
shutil.move(tmp_fullname, fullname)
return fullname
def _download(url, path):
"""
Download from url, save to path.
url (str): download url
path (str): download to given path
"""
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.split(url)[-1]
fullname = os.path.join(path, fname)
retry_cnt = 0
logger.info("Downloading {} from {}".format(fname, url))
DOWNLOAD_RETRY_LIMIT = 3
while not os.path.exists(fullname):
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
retry_cnt += 1
else:
raise RuntimeError("Download from {} failed. "
"Retry limit reached".format(url))
if not _get_download(url, fullname):
time.sleep(1)
continue
return fullname
def _uncompress_file_zip(filepath):
with zipfile.ZipFile(filepath, 'r') as files:
file_list = files.namelist()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
# `strip(os.sep)` to remove `os.sep` in the tail of path
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _is_a_single_file(file_list):
if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
return True
return False
def _is_a_single_dir(file_list):
new_file_list = []
for file_path in file_list:
if '/' in file_path:
file_path = file_path.replace('/', os.sep)
elif '\\' in file_path:
file_path = file_path.replace('\\', os.sep)
new_file_list.append(file_path)
file_name = new_file_list[0].split(os.sep)[0]
for i in range(1, len(new_file_list)):
if file_name != new_file_list[i].split(os.sep)[0]:
return False
return True
def _uncompress_file_tar(filepath, mode="r:*"):
with tarfile.open(filepath, mode) as files:
file_list = files.getnames()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _decompress(fname):
"""
Decompress for zip and tar file
"""
logger.info("Decompressing {}...".format(fname))
        # Dispatch on the archive type and extract the contents next to the
        # downloaded file; the extracted path is returned to the caller.
if tarfile.is_tarfile(fname):
uncompressed_path = _uncompress_file_tar(fname)
elif zipfile.is_zipfile(fname):
uncompressed_path = _uncompress_file_zip(fname)
else:
raise TypeError("Unsupport compress file type {}".format(fname))
return uncompressed_path
    assert is_url(url), "{} is not a valid download URL".format(url)
fullpath = _map_path(url, root_dir)
if os.path.exists(fullpath) and check_exist:
logger.info("Found {}".format(fullpath))
else:
fullpath = _download(url, root_dir)
if decompress and (tarfile.is_tarfile(fullpath) or
zipfile.is_zipfile(fullpath)):
fullpath = _decompress(fullpath)
return fullpath
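# Hypothetical usage sketch (not part of the original file); the URL and
# directory below are placeholders, substitute real values before running.
def _demo_get_path_from_url():
    archive_url = "https://example.com/uie_checkpoint.tar.gz"  # placeholder URL
    return get_path_from_url(archive_url, root_dir="./checkpoints",
                             check_exist=True, decompress=True)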
#! -*- coding: utf-8 -*-
# Expose SimBERT synonym generation as a simple web service using the built-in helper.
# A thin wrapper based on bottlepy, intended only for quick testing; performance is not guaranteed.
# For usage details see https://github.com/bojone/bert4keras/blob/8ffb46a16a79f87aa8cdf045df7994036b4be47d/bert4keras/snippets.py#L580
import torch
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import WebServing
# Basic settings
maxlen = 32
choice = 'simbert' # simbert simbert_v2
if choice == 'simbert':
args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base"
args_model = 'bert'
else:
args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base"
args_model = 'roformer'
# Load the simbert or simbert_v2 weights
root_model_path = args_model_path
dict_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load and prune the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
# Build and load the model
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool='linear', model=args_model,
application='unilm', keep_tokens=keep_tokens)
self.pool_method = pool_method
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return seq_logit, sen_emb
model = Model(pool_method='cls').to(device)
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps('logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
seq_logit, _ = model.predict([token_ids, segment_ids])
return seq_logit[:, -1, :]
def generate(self, text, n=1, topk=5):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random-sampling-based decoding
return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]
synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of input texts.
'''
X, S = [], []
for t in text_list:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
_, Z = model.predict([X, S])
return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
"""
r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
r = [text] + r
Z = cal_sen_emb(r)
Z /= (Z**2).sum(dim=1, keepdims=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
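# Standalone sketch (hypothetical helper, torch-only, not part of the original
# file) of the ranking step used in gen_synonyms above: L2-normalise the
# embeddings, then order candidates by cosine similarity to the first row,
# which holds the original sentence.
def _demo_rank_by_cosine(Z):
    Z = Z / (Z ** 2).sum(dim=1, keepdim=True) ** 0.5   # L2-normalise each row
    return torch.matmul(Z[1:], -Z[0]).argsort()        # ascending of -cos == most similar first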
if __name__ == '__main__':
arguments = {'text': (None, True), 'n': (int, False), 'k': (int, False)}
web = WebServing(port=8864)
web.route('/gen_synonyms', gen_synonyms, arguments)
web.start()
    # You can now test it by visiting http://127.0.0.1:8864/gen_synonyms?text=苹果多少钱一斤