#! -*- coding: utf-8 -*-
# SimBERT pretraining code; it can also be used for fine-tuning, although other approaches such as sentence_bert may work better for fine-tuning
# Official project: https://github.com/ZhuiyiTechnology/simbert
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder, Callback, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab
# Basic settings
maxlen = 32
batch_size = 32
# SimBERT weights are loaded here; continue to pretrain/finetune on your own data from this checkpoint
# To pretrain from scratch, you can also load a bert/roberta checkpoint directly
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取语料,每行一个json
示例:{"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
D.append(json.loads(l))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
for d in batch:
text, synonyms = d['text'], d['synonyms']
synonyms = [text] + synonyms
np.random.shuffle(synonyms)
text, synonym = synonyms[:2]
text, synonym = truncate(text), truncate(synonym)
token_ids, segment_ids = tokenizer.encode(text, synonym, maxlen=maxlen * 2)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
token_ids, segment_ids = tokenizer.encode(synonym, text, maxlen=maxlen * 2)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
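# Note: each (text, synonym) pair is encoded in both directions above, so rows 2k and 2k+1
# of a batch are mutual positives; get_labels_of_similarity below relies on exactly this layout.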
train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Build and load the model
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool='linear',
with_mlm='linear', application='unilm', keep_tokens=keep_tokens)
self.pool_method = pool_method
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return seq_logit, sen_emb
model = Model(pool_method='cls').to(device)
class TotalLoss(nn.Module):
"""loss分两部分,一是seq2seq的交叉熵,二是相似度的交叉熵。
"""
def forward(self, outputs, target):
seq_logit, sen_emb = outputs
seq_label, seq_mask = target
seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
similarity_loss = self.compute_loss_of_similarity(sen_emb)
return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}
def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
'''
y_pred: [btz, seq_len, hdsz]
y_true: [btz, seq_len]
y_mask: [btz, seq_len]
'''
y_true = y_true[:, 1:] # target token_ids
y_mask = y_mask[:, 1:] # marks the positions to be predicted
y_pred = y_pred[:, :-1, :] # predicted sequence, shifted left by one position
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return F.cross_entropy(y_pred, y_true, ignore_index=0)
def compute_loss_of_similarity(self, y_pred):
y_true = self.get_labels_of_similarity(y_pred) # build the labels
y_pred = F.normalize(y_pred, p=2, dim=-1) # L2-normalize the sentence embeddings
similarities = torch.matmul(y_pred, y_pred.T) # similarity matrix
similarities = similarities - torch.eye(y_pred.shape[0], device=device) * 1e12 # mask out the diagonal
similarities = similarities * 30 # scale
loss = F.cross_entropy(similarities, y_true)
return loss
def get_labels_of_similarity(self, y_pred):
idxs = torch.arange(0, y_pred.shape[0], device=device)
idxs_1 = idxs[None, :]
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
labels = idxs_1.eq(idxs_2).float()
return labels
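# For a batch of 4 rows, get_labels_of_similarity marks row 0 <-> row 1 and row 2 <-> row 3
# as positives, i.e. labels = [[0,1,0,0],[1,0,0,0],[0,0,0,1],[0,0,1,0]] (soft targets for the cross entropy).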
model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps('logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
seq_logit, _ = model.predict([token_ids, segment_ids])
return seq_logit[:, -1, :]
def generate(self, text, n=1, topk=5):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.random_sample([token_ids, segment_ids], n, topk) # random sampling
return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]
synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
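# Usage sketch (assuming the checkpoint configured above is available):
#   >>> synonyms_generator.generate('微信和支付宝哪个好?', n=3, topk=5)
# returns up to 3 sampled paraphrases, decoded back to text.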
def cal_sen_emb(text_list):
'''Compute sentence embeddings for a list of texts
'''
X, S = [], []
for t in text_list:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
_, Z = model.predict([X, S])
return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
r = [i for i in set(r) if i != text] # drop candidates identical to the input
r = [text] + r
Z = cal_sen_emb(r)
Z /= (Z**2).sum(dim=1, keepdim=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
def just_show(some_samples):
"""随机观察一些样本的效果
"""
S = [np.random.choice(some_samples) for _ in range(3)]
for s in S:
try:
print(u'Original sentence: %s' % s)
print(u'Synonyms:', gen_synonyms(s, 10, 10))
print()
except Exception:
pass
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
# save the best model
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# show some demo generations
just_show(['微信和支付宝拿个好用?',
'微信和支付宝,哪个好?',
'微信和支付宝哪个好',
'支付宝和微信哪个好',
'支付宝和微信哪个好啊',
'微信和支付宝那个好用?',
'微信和支付宝哪个好用',
'支付宝和微信那个更好',
'支付宝和微信哪个好用',
'微信和支付宝用起来哪个好?',
'微信和支付宝选哪个好'
])
if __name__ == '__main__':
choice = 'similarity' # train generate similarity
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
elif choice == 'generate':
print(gen_synonyms('我想去北京玩玩可以吗', 10, 10))
elif choice == 'similarity':
target_text = '我想去首都北京玩玩'
text_list = ['我想去北京玩', '北京有啥好玩的吗?我想去看看', '好渴望去北京游玩啊']
Z = cal_sen_emb([target_text]+text_list)
Z /= (Z**2).sum(dim=1, keepdim=True)**0.5
similarity = torch.matmul(Z[1:], Z[0])
for i, line in enumerate(text_list):
print(f'cos_sim: {similarity[i].item():.4f}, tgt_text: "{target_text}", cal_text: "{line}"')
else:
model.load_weights('./best_model.pt')
maxtime=`cat result_*.log |grep "^total_inf" |awk 'BEGIN{s=0}{if(s<$2) s=$2}END{print s}'`
#inftime=`cat result_*.log |grep "^total_inf" |awk '{s+=$2}END{print s, s/NR}'`
inffps=`cat result_*.log |grep "^avg_infer_fps" |awk '{s+=$2}END{print s, s/NR}'`
loadtime=`cat result_*.log |grep "^load_data_total" |awk 'BEGIN{s=0}{if(s<$2) s=$2}END{print s}'`
loadfps=`cat result_*.log |grep "^load_data_avg" |awk '{s+=$2}END{print s, s/NR}'`
echo "max infer time: $maxtime"
echo "Average infer fps: $inffps"
echo "max load time: $loadtime"
echo "Average load fps: $loadfps"
#! -*- coding:utf-8 -*-
# bert+crf for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
from apex.optimizers import FusedLAMB
import apex_C
from apex import amp
import migraphx
import torch.nn as nn
import torch.optim as optim
# from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.snippets import sequence_padding, ListDataset
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os
import time
import multiprocessing as mp
from multiprocessing import Process, Queue, Manager
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
maxlen = 256
batch_size = 64
config_path = '/datasets/bert-base-chinese/config.json'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
gpuid = os.getenv('HIP_VISIBLE_DEVICES')
labdir = os.path.join('results', gpuid, 'label')
resultdir = os.path.join('results', gpuid, 'data')
os.makedirs(resultdir, exist_ok=True)
os.makedirs(labdir, exist_ok=True)
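# Pre-allocate one GPU buffer per model output so migraphx can write inference results in place.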
def AllocateOutputMemory(model):
outputData={}
for key in model.get_outputs().keys():
outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
return outputData
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
if os.path.isfile("/home/sunzhq/workspace/yidong-infer/bert/bert4torch_cmcc/examples/sequence_labeling/models/bert_best_mha_md5.mxr"):
print("***********load mxr model******************")
model = migraphx.load("/home/sunzhq/workspace/yidong-infer/bert/bert4torch_cmcc/examples/sequence_labeling/models/bert_best_mha_md5.mxr")
else:
print("***********load onnx model******************")
# Load the model
maxInput={"input":[64,256]}
model = migraphx.parse_onnx("/models/bert_best.onnx", map_input_dims=maxInput)
migraphx.quantize_fp16(model)
# Compile
model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)
inputName=list(model.get_inputs().keys())[0]
modelData=AllocateOutputMemory(model)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
maxlen = 256
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen) # truncate to maxlen
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
# Initialize labels to all zeros (the 'O' tag)
labels = np.zeros(len(token_ids), dtype=np.int64)
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start_idx = start_mapping[start]
end_idx = end_mapping[end]
labels[start_idx] = categories_label2id['B-' + label]
labels[start_idx + 1:end_idx + 1] = categories_label2id['I-' + label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen, value=tokenizer._token_pad_id),
dtype=torch.long,
device="cuda:0")
batch_labels = torch.tensor(sequence_padding(batch_labels, length=maxlen, value=0),
dtype=torch.long,
device="cuda:0")
return batch_token_ids, batch_labels
# Convert the dataset
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
def pad_data(data, seq=256):
if len(data.shape) == 1:
return np.pad(data, ((0, seq-data.shape[0])),
"constant", constant_values=(0))
elif len(data.shape) == 2:
return np.pad(data, ((0, 0), (0, seq-data.shape[1])),
"constant", constant_values=(0))
else:
# shape(bs, seq, len(categories))
return np.pad(data, ((0, 0), (0, seq-data.shape[1]), (0, 0)),
"constant", constant_values=(0))
def pad_data_bin(data, output, bs, seq=256, len_category=7):
if output == "emission_score":
data = data.reshape((bs, -1, len_category))
else:
data = data.reshape((bs, -1))
return pad_data(data, seq)
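# e.g. a raw emission_score buffer for a batch of 64 is first reshaped to (64, seq_len, 7)
# and then right-padded along the sequence axis to seq=256 before CRF decoding.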
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
end = 0
infer_times = []
total_infer_times = []
data_idx = 0
total_start = time.time()
# warmup
for token_ids, label in tqdm(data):
data_numpy=token_ids.detach().cpu().numpy()
img_data = np.zeros(data_numpy.shape).astype("int64")
for i in range(data_numpy.shape[0]):
img_data[i, :] = data_numpy[i, :]
modelData[inputName] = migraphx.to_gpu(migraphx.argument(img_data))
preds_dcu = model.run(modelData)
break
for token_ids, label in tqdm(data):
data_numpy=token_ids.detach().cpu().numpy()
# device = torch.device("cuda")
# Note: the element-wise copy below is required, otherwise the input strides are wrong inside migraphx
img_data = np.zeros(data_numpy.shape).astype("int64")
for i in range(data_numpy.shape[0]):
img_data[i, :] = data_numpy[i, :]
if img_data.shape[0] != 64:
break
modelData[inputName] = migraphx.to_gpu(migraphx.argument(img_data))
start = time.time()
# result = model.run({"input":img_data})
preds_dcu = model.run(modelData)
end += time.time() - start
infer_times.append(time.time() - start)
print(f"****infer time: {infer_times[-1]} s***** fps: {64/infer_times[-1]}*********")
total_infer_times.append(time.time() - total_start)
result_1 = np.array(migraphx.from_gpu(preds_dcu[0]))
result_2 = np.array(migraphx.from_gpu(preds_dcu[1]))
emission_score = torch.from_numpy(np.array(result_1, copy=False))
attention_mask = torch.from_numpy(np.array(result_2, copy=False))
labels = label.cpu().numpy()
# emission_score = torch.from_numpy(np.array(result[0], copy=False))
# attention_mask = torch.from_numpy(np.array(result[1], copy=False))
# Save bin files
labels = np.pad(labels, ((0, batch_size-labels.shape[0]), (0,0)), 'constant', constant_values=-1)
labels.tofile(f'{labdir}/{data_idx}.bin')
emission_score = np.pad(emission_score, ((0, batch_size-emission_score.shape[0]), (0,0), (0,0)), 'constant')
attention_mask = np.pad(attention_mask, ((0, batch_size-attention_mask.shape[0]), (0,0)), 'constant')
emission_score.tofile(f'{resultdir}/{data_idx}_0.bin')
attention_mask.tofile(f'{resultdir}/{data_idx}_1.bin')
labels = pad_data_bin(labels, "labels", batch_size)
emission_score = pad_data_bin(emission_score, "emission_score", batch_size)
attention_mask = pad_data_bin(attention_mask, "attention_mask", batch_size)
labels = torch.Tensor(labels)
# mask last data
data_mask = labels[:, 0] >= 0
labels = labels[data_mask]
emission_score = torch.Tensor(emission_score)[data_mask]
attention_mask = torch.Tensor(attention_mask)[data_mask]
scores = crf.decode(emission_score, attention_mask)
true_label = []
for label in labels:
true_label += [categories_id2label[int(l)] for l in label if l != 0]
attention_mask = labels.gt(0)
# token level
X += (scores.eq(labels) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += labels.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(labels)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
data_idx += 1
total_start = time.time()
print("total_sample_data:", (64 * data_idx))
avg_infer_fps = 64 * len(infer_times) / sum(infer_times)
print(f"total_infer_time: {end}s")
print(f'avg_infer_fps: {avg_infer_fps}samples/s')
load_data_infer_time = sum(total_infer_times)
load_data_avg_infer_fps = len(total_infer_times) * 64 / sum(total_infer_times)
print(f'load_data_total_infer_time: {load_data_infer_time}s')
print(f'load_data_avg_total_Infer_fps: {load_data_avg_infer_fps} samples/s')
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
# skip the 'O' / padding label (0)
if item == 0:
continue
# safely fetch the tag name (the dict key must be an int)
tag_id = int(item.item()) # cast to int to avoid a float key
flag_tag = categories_id2label[tag_id]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for ent in entity_ids:
if ent: # only add non-empty entities
batch_entity_ids.add(tuple(ent))
return batch_entity_ids
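# e.g. for sample 0 tagged [O, B-LOC, I-LOC, O] this yields {(0, 1, 2, 'LOC')}.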
class Model(BaseModel):
def __init__(self, config_path):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=None, segment_vocab_size=0)
# embedding_dims:768, len_categories: 7
self.fc = nn.Linear(768, 7) # covers all positions, including the first/last tokens
self.crf = CRF(7)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def build_model(config_path, checkpoint_path):
model = Model(config_path).to("cpu")
model.load_weights(checkpoint_path, strict=False)
return model
if __name__ == '__main__':
ptmodel = build_model("/datasets/bert-base-chinese/config.json", "/models/best_model.pt")
crf = ptmodel.crf
# time_fw is the file object for the timing log, written to 'log/time.txt'
time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
# write the program start time
time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f}\n')
# write the program end time
time_fw.write('End Time: {:.6f}\n'.format(time.time()))
time_fw.flush()
time_fw.close()
#! -*- coding:utf-8 -*-
# bert+crf for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
from apex.optimizers import FusedLAMB
import apex_C
from apex import amp
import migraphx
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os
import time
maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = '/datasets/bert-base-chinese/config.json'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = "cuda"
gpuid = os.getenv('HIP_VISIBLE_DEVICES')
labdir = os.path.join('results', gpuid, 'label')
resultdir = os.path.join('results', gpuid, 'data')
os.makedirs(resultdir, exist_ok=True)
os.makedirs(labdir, exist_ok=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
if os.path.isfile("/workspace/bert4torch/examples/sequence_labeling/bert_best.mxr"):
model = migraphx.load("/workspace/bert4torch/examples/sequence_labeling/bert_best.mxr")
else:
# Load the model
maxInput={"input":[64,256]}
model = migraphx.parse_onnx("/workspace/bert4torch/examples/sequence_labeling/bert_best.onnx", map_input_dims=maxInput)
migraphx.quantize_fp16(model)
# Compile
model.compile(migraphx.get_target("gpu"),device_id=0)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Convert the dataset
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn, shuffle=False, drop_last=True)
crf = CRF(len(categories)).to(device)
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
end = 0
infer_times = []
total_infer_times = []
data_idx = 0
total_start = time.time()
for token_ids, label in tqdm(data):
data_numpy=token_ids.detach().cpu().numpy()
device = torch.device("cuda")
# Note: the element-wise copy below is required, otherwise the input strides are wrong inside migraphx
img_data = np.zeros(data_numpy.shape).astype("int64")
for i in range(data_numpy.shape[0]):
img_data[i, :] = data_numpy[i, :]
start = time.time()
result = model.run({"input":img_data})
end += time.time() - start
infer_times.append(time.time() - start)
total_infer_times.append(time.time() - total_start)
emission_score = torch.from_numpy(np.array(result[0], copy=False)).to(device)
attention_mask = torch.from_numpy(np.array(result[1], copy=False)).to(device)
scores = crf.decode(emission_score, attention_mask)
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
# Save bin files
score = np.array(result[0])
score = np.pad(score, ((0, batch_size-emission_score.shape[0]), (0,0), (0,0)), 'constant')
mask = np.array(result[1])
mask = np.pad(mask, ((0, batch_size-emission_score.shape[0]), (0,0)), 'constant')
#score = np.pad(np.array(result[0]), ((0, batch_size-emission_score.shape[0]), (0,0), (0,0), 'constant'))
#mask = np.pad(np.array(result[1]), ((0, batch_size-emission_score.shape[0]), (0,0), (0,0), 'constant'))
label = label.cpu().numpy()
label = np.pad(label, ((0, batch_size-emission_score.shape[0]), (0,0)), 'constant', constant_values=-1)
score.tofile(f'{resultdir}/{data_idx}_0.bin')
mask.tofile(f'{resultdir}/{data_idx}_1.bin')
label.tofile(f'{labdir}/{data_idx}.bin')
#np.array(result[0], copy=False).tofile(f'{resultdir}/{data_idx}_0.bin')
#np.array(result[1], copy=False).tofile(f'{resultdir}/{data_idx}_1.bin')
#label.cpu().numpy().tofile(f'{labdir}/{data_idx}.bin')
data_idx += 1
total_start = time.time()
print("total_sample_data:", (64 * data_idx))
#avg_infer_time = sum(infer_times[1:]) / len(infer_times[1:])
avg_infer_fps = 64 * len(infer_times) / sum(infer_times) # this is a throughput, so name it as fps
print(f"total_infer_time: {end}s")
print(f'avg_infer_fps: {avg_infer_fps}samples/s')
load_data_infer_time = sum(total_infer_times)
load_data_avg_infer_fps = len(total_infer_times) * 64 / sum(total_infer_times)
print(f'load_data_total_infer_time: {load_data_infer_time}s')
print(f'load_data_avg_total_Infer_fps: {load_data_avg_infer_fps} samples/s')
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for ent in entity_ids:
if ent:
batch_entity_ids.add(tuple(ent))
return batch_entity_ids
if __name__ == '__main__':
# time_fw is the file object for the timing log, written to 'log/time.txt'
time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
# write the program start time
time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f}\n')
# write the program end time
time_fw.write('End Time: {:.6f}\n'.format(time.time()))
time_fw.flush()
time_fw.close()
import os
import numpy as np
from sklearn.metrics import classification_report
# ===================== model.pt is not loaded, so the results are somewhat low =====================
result_dir = "results/0"
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
maxlen = 256
batch_size = 64
label_dir = os.path.join(result_dir, "label")
data_dir = os.path.join(result_dir, "data")
# ===================== Load labels =====================
def load_all_labels():
y_true = []
files = sorted(os.listdir(label_dir), key=lambda x: int(os.path.splitext(x)[0]))
for f in files:
arr = np.fromfile(os.path.join(label_dir, f), dtype=np.int64).reshape(-1, maxlen)
for seq in arr:
y_true.extend(seq[seq != -1].tolist())
return y_true
# ===================== Load predictions =====================
def load_all_preds():
y_pred = []
bin_files = sorted([f for f in os.listdir(data_dir) if f.endswith("_0.bin")], key=lambda x: int(x.split("_")[0]))
for fname in bin_files:
idx = int(fname.split("_")[0])
emit = np.fromfile(os.path.join(data_dir, f"{idx}_0.bin"), dtype=np.float32).reshape(batch_size, maxlen, 7)
pred = np.argmax(emit, axis=-1)
for seq in pred:
y_pred.extend(seq.tolist())
return y_pred
# ===================== Merge BIO tags =====================
def merge_bio(seq):
res = []
for x in seq:
if x in (1,2): res.append("LOC")
elif x in (3,4): res.append("PER")
elif x in (5,6): res.append("ORG")
else: res.append("O")
return res
# ===================== Main =====================
if __name__ == "__main__":
y_true = load_all_labels()
y_pred = load_all_preds()
L = min(len(y_true), len(y_pred))
y_true = y_true[:L]
y_pred = y_pred[:L]
y_true_ent = merge_bio(y_true)
y_pred_ent = merge_bio(y_pred)
print("\n" + "="*60)
print(classification_report(y_true_ent, y_pred_ent, digits=4))
print("="*60)
#! -*- coding:utf-8 -*-
# bert+crf for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
from apex.optimizers import FusedLAMB
import apex_C
from apex import amp
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os
import time
maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = "/models/best_model.pt"
dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#local_rank = int(os.environ['LOCAL_RANK'])
#print("local_rank ", local_rank)
#torch.cuda.set_device(local_rank)
#device = torch.device("cuda", local_rank)
#torch.distributed.init_process_group(backend='nccl')
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Convert the dataset
#train_dataset = MyDataset('/workspace/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
#train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
#train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories)) # covers all positions, including the first/last tokens
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
## For DDP multi-gpu training, master_rank is the local_rank used to print training progress
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels) # the DDP wrapper below is commented out, so use model.crf directly (model.module.crf with DDP)
#try to use apex
optimizer = optim.Adam(model.parameters(), lr=6e-5)
#model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", master_weights=True, verbosity=0)
#model = BaseModelDDP(model, master_rank=0, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=False)
model.compile(
loss=Loss(),
optimizer=optimizer,
# use_apex=True, # whether to use apex_amp mixed precision
)
#------------------------------------------------------------
def evaluate(data):
# for token_ids, label in tqdm(data):
# #torch.onnx.export(model.module, token_ids, "./bert_best.onnx", opset_version=13,
# torch.onnx.export(model, token_ids, "./bert_best.onnx", opset_version=13,
# input_names=['input'],
# output_names=['output'],
# dynamic_axes={'input': {1: 'token'}}) # dim 1 is dynamic; dim 0 defaults to the batch dim
# print("ONNX model export finished")
# break
model.eval()
dummy_input = torch.randint(1, 2000, size=(64, 256), dtype=torch.long, device=device)
torch.onnx.export(
model,
dummy_input,
"/models/onnx-models/bert_best_static.onnx",
opset_version=13,
input_names=["input"],
output_names=["emission_scores", "attention_mask"], # 更准确的输出名
do_constant_folding=True,
)
print("✅ 静态 ONNX 导出完成!")
# for token_ids, label in tqdm(data):
# #torch.onnx.export(model.module, token_ids, "./bert_best.onnx", opset_version=13,
# torch.onnx.export(model, token_ids, "./bert_best_1.onnx", opset_version=13,
# input_names=['input'],
# output_names=['output'],
# dynamic_axes={'input': {1: 'token'}}, # dim 1 is dynamic; dim 0 defaults to the batch dim
# do_constant_folding=True) # enable constant folding to reduce runtime computation
# print("ONNX model export finished")
# break
if __name__ == '__main__':
# time_fw is the file object for the timing log, written to 'log/time.txt'
time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
# write the program start time
time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
model.load_weights("/models/best_model.pt")
evaluate(valid_dataloader)
import os
import json
import argparse
from tqdm import tqdm
import numpy as np
import torch
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
import torch.nn as nn
from bert4torch.layers import CRF
from bert4torch.models import build_transformer_model, BaseModel
""" 运行命令
python bertbase_postprocess.py -i results/0/data -l results/0/label -o output -cbert-base-chinese/config.json -k best_model.pt
"""
class Model(BaseModel):
def __init__(self, config_path):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=None, segment_vocab_size=0)
# embedding_dims:768, len_categories: 7
self.fc = nn.Linear(768, 7) # covers all positions, including the first/last tokens
self.crf = CRF(7)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def build_model(config_path, checkpoint_path):
model = Model(config_path).to("cpu")
model.load_weights(checkpoint_path, strict=False)
return model
def pad_data(data, seq=256):
if len(data.shape) == 1:
return np.pad(data, ((0, seq-data.shape[0])),
"constant", constant_values=(0))
elif len(data.shape) == 2:
return np.pad(data, ((0, 0), (0, seq-data.shape[1])),
"constant", constant_values=(0))
else:
# shape(bs, seq, len(categories))
return np.pad(data, ((0, 0), (0, seq-data.shape[1]), (0, 0)),
"constant", constant_values=(0))
def pad_data_npy(path, seq=256):
return pad_data(np.load(path), seq)
def pad_data_bin(path, output, bs, seq=256, len_category=7):
data = None
if output == "emission_score":
data = np.fromfile(path, dtype=np.float32).reshape((bs, -1, len_category))
else:
data = np.fromfile(path, dtype=np.int64).reshape((bs, -1))
return pad_data(data, seq)
def evaluate(result_dir, label_dir, bs):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
true_labels, true_predictions = [], []
data_num = len(os.listdir(label_dir))
emission_score = None
labels = None
attention_mask = None
for data_idx in tqdm(range(data_num)):
emission_score_path = [
os.path.join(result_dir, f"{data_idx}_0.{fmt}") for fmt in ["npy", "bin"]
]
if os.path.exists(emission_score_path[0]):
emission_score = pad_data_npy(emission_score_path[0])
else:
print(emission_score_path[1])
emission_score = pad_data_bin(emission_score_path[1], "emission_score", bs)
attention_mask_path = [
os.path.join(result_dir, f"{data_idx}_1.{fmt}") for fmt in ["npy", "bin"]
]
if os.path.exists(attention_mask_path[0]):
attention_mask = pad_data_npy(attention_mask_path[0])
else:
attention_mask = pad_data_bin(attention_mask_path[1], "attention_mask", bs)
label_path = [
os.path.join(label_dir, f"{data_idx}.{fmt}") for fmt in ["npy", "bin"]
]
if os.path.exists(label_path[0]):
labels = pad_data_npy(label_path[0])
else:
labels = pad_data_bin(label_path[1], "labels", bs)
labels = torch.Tensor(labels)
# mask last data
data_mask = labels[:, 0] >= 0
labels = labels[data_mask]
emission_score = torch.Tensor(emission_score)[data_mask]
attention_mask = torch.Tensor(attention_mask)[data_mask]
scores = crf.decode(emission_score, attention_mask)
true_label = []
for label in labels:
true_label += [categories_id2label[int(l)] for l in label if l != -100]
true_labels.append(true_label)
true_prediction = []
for score in scores:
true_prediction += [categories_id2label[int(p)] for p in score if p != -100]
true_predictions.append(true_prediction)
attention_mask = labels.gt(0)
# token level
X += (scores.eq(labels) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += labels.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(labels)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
eval_result = classification_report(true_labels,
true_predictions,
digits=4,
mode='strict',
scheme=IOB2)
print(eval_result)
f1, p1, r1 = 2 * X / (Y + Z), X / Y, X / Z
f2, p2, r2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
print("val-token level: f1:{}, precision: {}, recall:{}".format(f1, p1, r1))
print("val-entity level: f1:{}, precision: {}, recall:{}".format(f2, p2, r2))
result_dict = {
"seqeval_result": eval_result,
"val-token level": {
"f1": f1,
"precision": p1,
"recall": r1
},
"val-entity level": {
"f1": f2,
"precision": p2,
"recall": r2
}
}
return result_dict
def trans_entity2tuple(scores):
'''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and \
(flag_tag[2:] == entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for ent in entity_ids:
if ent:
batch_entity_ids.add(tuple(ent))
return batch_entity_ids
def parse_arguments():
parser = argparse.ArgumentParser(description='Bert_Base_Chinese postprocess for sequence labeling task.')
parser.add_argument('-i', '--result_dir', type=str, required=True,
help='result dir for prediction results')
parser.add_argument('-o', '--out_path', type=str, required=True,
help='save path for evaluation result')
parser.add_argument('-l', '--label_dir', type=str, required=True,
help='label dir for label results')
parser.add_argument('-c', '--config_path', type=str, required=True,
help='config path for export model')
parser.add_argument('-k', '--ckpt_path', type=str, default="./best_model.pt",
help='result dir for prediction results')
parser.add_argument('-bs', '--batch_size', type=int, default=64,
help='Batch size of output data.')
arguments = parser.parse_args()
arguments.out_path = os.path.abspath(arguments.out_path)
dir_name = os.path.dirname(arguments.out_path)
if not os.path.exists(dir_name):
os.makedirs(dir_name)
return arguments
if __name__ == '__main__':
args = parse_arguments()
model = build_model(args.config_path, args.ckpt_path)
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
crf = model.crf
evaluate_results = evaluate(args.result_dir, args.label_dir, args.batch_size)
with open(args.out_path, 'w') as f:
json.dump(evaluate_results, f, ensure_ascii=False, indent=4)
python3 -m onnxruntime.transformers.optimizer \
--input /home/sunzhq/workspace/onnx_models/bert/bert_best.onnx \
--output /home/sunzhq/workspace/onnx_models/bert/bert_best_fused.onnx \
--use_multi_head_attention \
--num_heads 12 \
--hidden_size 768 \
--model_type bert \
--disable_skip_layer_norm \
--disable_gelu \
--use_gpu \
--disable_embed_layer_norm \
--use_mask_index \
--use_raw_attention_mask
# --no_attention_mask \
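# Note: with --use_multi_head_attention the optimizer folds each per-head attention subgraph into a
# single MultiHeadAttention op; the resulting *_fused.onnx is presumably what was compiled into the
# bert_best_mha_md5.mxr file loaded by the migraphx inference script above.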
source /opt/dtk/env.sh
export HIP_PRINTF_DEBUG_FOR_FP64=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIGRAPHX_ENABLE_GEMM_SOFTMAX_GEMM_FUSE=1 # improves performance, may affect accuracy
export MIGRAPHX_ENABLE_MHA=1
export HIP_VISIBLE_DEVICES=3
numactl -N 3 -m 3 python bert_migraphx.py
# export HIP_VISIBLE_DEVICES=0
# nohup numactl -N 0 -m 0 python bert_migraphx.py 2>&1 | tee result_0.log &
# export HIP_VISIBLE_DEVICES=1
# nohup numactl -N 1 -m 1 python bert_migraphx.py 2>&1 | tee result_1.log &
# export HIP_VISIBLE_DEVICES=2
# nohup numactl -N 2 -m 2 python bert_migraphx.py 2>&1 | tee result_2.log &
# export HIP_VISIBLE_DEVICES=3
# nohup numactl -N 3 -m 3 python bert_migraphx.py 2>&1 | tee result_3.log &
# export HIP_VISIBLE_DEVICES=4
# nohup python bert_migraphx.py 2>&1 | tee result_4.log &
# export HIP_VISIBLE_DEVICES=5
# nohup python bert_migraphx.py 2>&1 | tee result_5.log &
# export HIP_VISIBLE_DEVICES=6
# nohup python bert_migraphx.py 2>&1 | tee result_6.log &
# export HIP_VISIBLE_DEVICES=7
# nohup python bert_migraphx.py 2>&1 | tee result_7.log &
{
"seqeval_result": " precision recall f1-score support\n\n LOC 0.9662 0.9703 0.9683 1887\n ORG 0.9289 0.9431 0.9360 984\n PER 0.9673 0.9706 0.9689 884\n\n micro avg 0.9566 0.9632 0.9599 3755\n macro avg 0.9542 0.9613 0.9577 3755\nweighted avg 0.9567 0.9632 0.9600 3755\n",
"val-token level": {
"f1": 0.9724224643755242,
"precision": 0.9683639398998333,
"recall": 0.9765151515151517
},
"val-entity level": {
"f1": 0.9599256900212325,
"precision": 0.9566252314202603,
"recall": 0.9632490013315589
}
}
{
"seqeval_result": " precision recall f1-score support\n\n LOC 0.9691 0.9712 0.9702 1876\n ORG 0.9286 0.9409 0.9347 982\n PER 0.9659 0.9692 0.9675 876\n\n micro avg 0.9576 0.9628 0.9602 3734\n macro avg 0.9546 0.9604 0.9575 3734\nweighted avg 0.9577 0.9628 0.9602 3734\n",
"val-token level": {
"f1": 0.9726518056550506,
"precision": 0.9692617787855886,
"recall": 0.9760656292286877
},
"val-entity level": {
"f1": 0.9602029914529925,
"precision": 0.957645178476293,
"recall": 0.9627745045527595
}
}
for index in {0..7}
do
python bertbase_postprocess.py -i results/${index}/data -l results/${index}/label -o output -c bert-base-chinese/config.json -k best_model.pt
done
#! -*- coding:utf-8 -*-
# W2NER: https://github.com/ljynlp/W2NER
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.layers import LayerNorm
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from collections import defaultdict, deque
from sklearn.metrics import precision_recall_fscore_support
# Model parameters: training
epochs = 20 # number of training epochs
steps_per_epoch = 100 # steps per epoch
maxlen = 256 # maximum length
batch_size = 8 # set according to GPU memory
learning_rate = 1e-3
clip_grad_norm = 5.0
bert_learning_rate = 5e-6
warm_factor = 0.1
weight_decay = 0
use_bert_last_4_layers = True
categories = {'LOC':2, 'PER':3, 'ORG':4}
label_num = len(categories) + 2
# Model parameters: network structure
dist_emb_size = 20
type_emb_size = 20
lstm_hid_size = 512
conv_hid_size = 96
bert_hid_size = 768
biaffine_size = 512
ffnn_hid_size = 288
dilation = [1, 2, 3]
emb_dropout = 0.5
conv_dropout = 0.5
out_dropout = 0.33
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Relative distance buckets
dis2idx = np.zeros((1000), dtype='int64')
dis2idx[1] = 1
dis2idx[2:] = 2
dis2idx[4:] = 3
dis2idx[8:] = 4
dis2idx[16:] = 5
dis2idx[32:] = 6
dis2idx[64:] = 7
dis2idx[128:] = 8
dis2idx[256:] = 9
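# dis2idx buckets word distances roughly logarithmically: distance 1 -> bucket 1, 2-3 -> 2,
# 4-7 -> 3, 8-15 -> 4, ..., >=256 -> 9, so the distance embedding table stays small.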
# Small helper functions
def convert_index_to_text(index, type):
text = "-".join([str(i) for i in index])
text = text + "-#-{}".format(type)
return text
def convert_text_to_index(text):
index, type = text.split("-#-")
index = [int(x) for x in index.split("-")]
return index, int(type)
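# e.g. convert_index_to_text([3, 4, 5], 2) -> '3-4-5-#-2', and convert_text_to_index inverts it.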
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in tqdm(f.split('\n\n'), desc='Load data'):
if not l:
continue
sentence, d = [], []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
sentence += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
if len(sentence) > maxlen - 2:
continue
tokens = [tokenizer.tokenize(word)[1:-1] for word in sentence[:maxlen-2]]
pieces = [piece for pieces in tokens for piece in pieces]
tokens_ids = [tokenizer._token_start_id] + tokenizer.tokens_to_ids(pieces) + [tokenizer._token_end_id]
assert len(tokens_ids) <= maxlen
length = len(tokens)
# piece-to-word mapping; for Chinese the two coincide, except for [CLS] and [SEP]
_pieces2word = np.zeros((length, len(tokens_ids)), dtype=bool)
e_start = 0
for i, pieces in enumerate(tokens):
if len(pieces) == 0:
continue
pieces = list(range(e_start, e_start + len(pieces)))
_pieces2word[i, pieces[0] + 1:pieces[-1] + 2] = 1
e_start += len(pieces)
# relative distances
_dist_inputs = np.zeros((length, length), dtype=int)
for k in range(length):
_dist_inputs[k, :] += k
_dist_inputs[:, k] -= k
for i in range(length):
for j in range(length):
if _dist_inputs[i, j] < 0:
_dist_inputs[i, j] = dis2idx[-_dist_inputs[i, j]] + 9
else:
_dist_inputs[i, j] = dis2idx[_dist_inputs[i, j]]
_dist_inputs[_dist_inputs == 0] = 19
# gold labels
_grid_labels = np.zeros((length, length), dtype=int)
_grid_mask2d = np.ones((length, length), dtype=bool)
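# Grid encoding (per the W2NER paper): cell [i, i+1] = 1 links consecutive words inside an
# entity (NNW), and cell [tail, head] stores the entity type id (THW); decode() later inverts this.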
for entity in d:
e_start, e_end, e_type = entity[0], entity[1]+1, entity[-1]
if e_end >= maxlen - 2:
continue
index = list(range(e_start, e_end))
for i in range(len(index)):
if i + 1 >= len(index):
break
_grid_labels[index[i], index[i + 1]] = 1
_grid_labels[index[-1], index[0]] = categories[e_type]
_entity_text = set([convert_index_to_text(list(range(e[0], e[1]+1)), categories[e[-1]]) for e in d])
D.append((tokens_ids, _pieces2word, _dist_inputs, _grid_labels, _grid_mask2d, _entity_text))
return D
def collate_fn(data):
tokens_ids, pieces2word, dist_inputs, grid_labels, grid_mask2d, _entity_text = map(list, zip(*data))
sent_length = torch.tensor([i.shape[0] for i in pieces2word], dtype=torch.long, device=device)
# max_wordlen: length in words (not tokens); max_tokenlen: length in tokens
max_wordlen = torch.max(sent_length).item()
max_tokenlen = np.max([len(x) for x in tokens_ids])
tokens_ids = torch.tensor(sequence_padding(tokens_ids), dtype=torch.long, device=device)
batch_size = tokens_ids.size(0)
def fill(data, new_data):
for j, x in enumerate(data):
new_data[j, :x.shape[0], :x.shape[1]] = torch.tensor(x, dtype=torch.long, device=device)
return new_data
dis_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.long, device=device)
dist_inputs = fill(dist_inputs, dis_mat)
labels_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.long, device=device)
grid_labels = fill(grid_labels, labels_mat)
mask2d_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.bool, device=device)
grid_mask2d = fill(grid_mask2d, mask2d_mat)
sub_mat = torch.zeros((batch_size, max_wordlen, max_tokenlen), dtype=torch.bool, device=device)
pieces2word = fill(pieces2word, sub_mat)
return [tokens_ids, pieces2word, dist_inputs, sent_length, grid_mask2d], [grid_labels, grid_mask2d, _entity_text]
# Load the data
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of bert
class ConvolutionLayer(nn.Module):
'''Convolution layer
'''
def __init__(self, input_size, channels, dilation, dropout=0.1):
super(ConvolutionLayer, self).__init__()
self.base = nn.Sequential(
nn.Dropout2d(dropout),
nn.Conv2d(input_size, channels, kernel_size=1),
nn.GELU(),
)
self.convs = nn.ModuleList(
[nn.Conv2d(channels, channels, kernel_size=3, groups=channels, dilation=d, padding=d) for d in dilation])
def forward(self, x):
x = x.permute(0, 3, 1, 2).contiguous()
x = self.base(x)
outputs = []
for conv in self.convs:
x = conv(x)
x = F.gelu(x)
outputs.append(x)
outputs = torch.cat(outputs, dim=1)
outputs = outputs.permute(0, 2, 3, 1).contiguous()
return outputs
class Biaffine(nn.Module):
'''Biaffine transformation
'''
def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
super(Biaffine, self).__init__()
self.n_in = n_in
self.n_out = n_out
self.bias_x = bias_x
self.bias_y = bias_y
weight = torch.zeros((n_out, n_in + int(bias_x), n_in + int(bias_y)))
nn.init.xavier_normal_(weight)
self.weight = nn.Parameter(weight, requires_grad=True)
def extra_repr(self):
s = f"n_in={self.n_in}, n_out={self.n_out}"
if self.bias_x:
s += f", bias_x={self.bias_x}"
if self.bias_y:
s += f", bias_y={self.bias_y}"
return s
def forward(self, x, y):
if self.bias_x:
x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
if self.bias_y:
y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
# [batch_size, n_out, seq_len, seq_len]
s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
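# the einsum computes s[b, o, x, y] = x[b, x, :] @ weight[o] @ y[b, y, :] for every token pair (x, y)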
# remove dim 1 if n_out == 1
s = s.permute(0, 2, 3, 1)
return s
class MLP(nn.Module):
'''MLP fully-connected layer
'''
def __init__(self, n_in, n_out, dropout=0):
super().__init__()
self.linear = nn.Linear(n_in, n_out)
self.activation = nn.GELU()
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = self.dropout(x)
x = self.linear(x)
x = self.activation(x)
return x
class CoPredictor(nn.Module):
def __init__(self, cls_num, hid_size, biaffine_size, channels, ffnn_hid_size, dropout=0):
super().__init__()
self.mlp1 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
self.mlp2 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
self.biaffine = Biaffine(n_in=biaffine_size, n_out=cls_num, bias_x=True, bias_y=True)
self.mlp_rel = MLP(channels, ffnn_hid_size, dropout=dropout)
self.linear = nn.Linear(ffnn_hid_size, cls_num)
self.dropout = nn.Dropout(dropout)
def forward(self, x, y, z):
h = self.dropout(self.mlp1(x))
t = self.dropout(self.mlp2(y))
o1 = self.biaffine(h, t)
z = self.dropout(self.mlp_rel(z))
o2 = self.linear(z)
return o1 + o2
class Model(BaseModel):
def __init__(self, use_bert_last_4_layers=False):
super().__init__()
self.use_bert_last_4_layers = use_bert_last_4_layers
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, # segment_vocab_size=0,
output_all_encoded_layers = True if use_bert_last_4_layers else False)
lstm_input_size = self.bert.configs['hidden_size']
self.dis_embs = nn.Embedding(20, dist_emb_size)
self.reg_embs = nn.Embedding(3, type_emb_size)
self.encoder = nn.LSTM(lstm_input_size, lstm_hid_size // 2, num_layers=1, batch_first=True,
bidirectional=True)
conv_input_size = lstm_hid_size + dist_emb_size + type_emb_size
self.convLayer = ConvolutionLayer(conv_input_size, conv_hid_size, dilation, conv_dropout)
self.dropout = nn.Dropout(emb_dropout)
self.predictor = CoPredictor(label_num, lstm_hid_size, biaffine_size,
conv_hid_size * len(dilation), ffnn_hid_size, out_dropout)
self.cln = LayerNorm(lstm_hid_size, conditional_size=lstm_hid_size)
def forward(self, token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d):
bert_embs = self.bert([token_ids, torch.zeros_like(token_ids)])
if self.use_bert_last_4_layers:
bert_embs = torch.stack(bert_embs[-4:], dim=-1).mean(-1)
length = pieces2word.size(1)
min_value = torch.min(bert_embs).item()
# max pooling over the pieces of each word
_bert_embs = bert_embs.unsqueeze(1).expand(-1, length, -1, -1)
_bert_embs = torch.masked_fill(_bert_embs, pieces2word.eq(0).unsqueeze(-1), min_value)
word_reps, _ = torch.max(_bert_embs, dim=2)
# LSTM
word_reps = self.dropout(word_reps)
packed_embs = pack_padded_sequence(word_reps, sent_length.cpu(), batch_first=True, enforce_sorted=False)
packed_outs, (hidden, _) = self.encoder(packed_embs)
word_reps, _ = pad_packed_sequence(packed_outs, batch_first=True, total_length=sent_length.max())
# conditional LayerNorm
cln = self.cln([word_reps.unsqueeze(2), word_reps])
# concat
dis_emb = self.dis_embs(dist_inputs)
tril_mask = torch.tril(grid_mask2d.clone().long())
reg_inputs = tril_mask + grid_mask2d.clone().long()
reg_emb = self.reg_embs(reg_inputs)
conv_inputs = torch.cat([dis_emb, reg_emb, cln], dim=-1)
# convolution layer
conv_inputs = torch.masked_fill(conv_inputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
conv_outputs = self.convLayer(conv_inputs)
conv_outputs = torch.masked_fill(conv_outputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
# output layer
outputs = self.predictor(word_reps, word_reps, conv_outputs)
return outputs
model = Model(use_bert_last_4_layers).to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, outputs, labels):
grid_labels, grid_mask2d, _ = labels
grid_mask2d = grid_mask2d.clone()
return super().forward(outputs[grid_mask2d], grid_labels[grid_mask2d])
bert_params = set(model.bert.parameters())
other_params = list(set(model.parameters()) - bert_params)
no_decay = ['bias', 'LayerNorm.weight']
params = [
{'params': [p for n, p in model.bert.named_parameters() if not any(nd in n for nd in no_decay)],
'lr': bert_learning_rate,
'weight_decay': weight_decay},
{'params': [p for n, p in model.bert.named_parameters() if any(nd in n for nd in no_decay)],
'lr': bert_learning_rate,
'weight_decay': 0.0},
{'params': other_params,
'lr': learning_rate,
'weight_decay': weight_decay},
]
optimizer = optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
updates_total = (len(train_dataloader) if steps_per_epoch is None else steps_per_epoch) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warm_factor * updates_total, num_training_steps=updates_total)
model.compile(loss=Loss(), optimizer=optimizer, scheduler=scheduler, clip_grad_norm=clip_grad_norm)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, p, r, e_f1, e_p, e_r = self.evaluate(valid_dataloader)
if e_f1 > self.best_val_f1:
self.best_val_f1 = e_f1
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {p:.5f} r: {r:.5f}')
print(f'[val-entity level] f1: {e_f1:.5f}, p: {e_p:.5f} r: {e_r:.5f} best_f1: {self.best_val_f1:.5f}\n')
def evaluate(self, data_loader):
def cal_f1(c, p, r):
if r == 0 or p == 0:
return 0, 0, 0
r = c / r if r else 0
p = c / p if p else 0
if r and p:
return 2 * p * r / (p + r), p, r
return 0, p, r
pred_result = []
label_result = []
total_ent_r = 0
total_ent_p = 0
total_ent_c = 0
for data_batch in tqdm(data_loader, desc='Evaluate'):
(token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d), (grid_labels, grid_mask2d, entity_text) = data_batch
outputs = model.predict([token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d])
grid_mask2d = grid_mask2d.clone()
outputs = torch.argmax(outputs, -1)
ent_c, ent_p, ent_r, _ = self.decode(outputs.cpu().numpy(), entity_text, sent_length.cpu().numpy())
total_ent_r += ent_r
total_ent_p += ent_p
total_ent_c += ent_c
grid_labels = grid_labels[grid_mask2d].contiguous().view(-1)
outputs = outputs[grid_mask2d].contiguous().view(-1)
label_result.append(grid_labels.cpu())
pred_result.append(outputs.cpu())
label_result = torch.cat(label_result)
pred_result = torch.cat(pred_result)
p, r, f1, _ = precision_recall_fscore_support(label_result.numpy(), pred_result.numpy(), average="macro")
e_f1, e_p, e_r = cal_f1(total_ent_c, total_ent_p, total_ent_r)
return f1, p, r, e_f1, e_p, e_r
def decode(self, outputs, entities, length):
class Node:
def __init__(self):
self.THW = [] # [(tail, type)]
self.NNW = defaultdict(set) # {(head,tail): {next_index}}
ent_r, ent_p, ent_c = 0, 0, 0
decode_entities = []
q = deque()
for instance, ent_set, l in zip(outputs, entities, length):
predicts = []
nodes = [Node() for _ in range(l)]
count = 0
for cur in reversed(range(l)):
# if count >= 29:
# print(count)
count += 1
heads = []
for pre in range(cur+1):
# THW
if instance[cur, pre] > 1:
nodes[pre].THW.append((cur, instance[cur, pre]))
heads.append(pre)
# NNW
if pre < cur and instance[pre, cur] == 1:
# cur node
for head in heads:
nodes[pre].NNW[(head,cur)].add(cur)
# post nodes
for head,tail in nodes[cur].NNW.keys():
if tail >= cur and head <= pre:
nodes[pre].NNW[(head,tail)].add(cur)
# entity
for tail,type_id in nodes[cur].THW:
if cur == tail:
predicts.append(([cur], type_id))
continue
q.clear()
q.append([cur])
while len(q) > 0:
chains = q.pop()
for idx in nodes[chains[-1]].NNW[(cur,tail)]:
if idx == tail:
predicts.append((chains + [idx], type_id))
else:
q.append(chains + [idx])
predicts = set([convert_index_to_text(x[0], x[1]) for x in predicts])
decode_entities.append([convert_text_to_index(x) for x in predicts])
ent_r += len(ent_set)
ent_p += len(predicts)
ent_c += len(predicts.intersection(ent_set))
return ent_c, ent_p, ent_r, decode_entities
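# Hedged toy illustration (not part of the original script) of the grid-label
# convention that Evaluator.decode above assumes: the lower triangle stores
# Tail-Head-Word entries, i.e. instance[tail, head] > 1 gives the entity type id
# for the span with that head and tail, while the upper triangle stores
# Next-Neighboring-Word links, i.e. instance[i, j] == 1 with i < j marks that
# token j follows token i inside some entity.
import numpy as np  # only needed for this sketch
_toy = np.zeros((3, 3), dtype=int)
_toy[0, 1] = 1  # NNW: token 0 is followed by token 1 inside an entity
_toy[1, 0] = 2  # THW: head=0, tail=1, entity type id 2
# decode() would recover the single entity ([0, 1], type_id=2) from _toy.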
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# bert+crf cascade: stage one tags BIO spans, stage two classifies each span
# reference blog: https://zhuanlan.zhihu.com/p/166496466
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 98.11; entity_level: 96.23
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels, batch_entity_ids, batch_entity_labels = [], [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
entity_ids, entity_labels = [], []
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = 1 # B tag
labels[start + 1:end + 1] = 2 # I tag
entity_ids.append([start, end])
entity_labels.append(categories.index(label)+1)
if not entity_ids: # need at least one entity per sample
entity_ids.append([0, 0]) # pad with a dummy [0, 0] entity if there is none
entity_labels.append(0)
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_entity_ids.append(entity_ids)
batch_entity_labels.append(entity_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device) # [btz, n_entities, start/end]
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device) # [btz, n_entities]
return [batch_token_ids, batch_entity_ids], [batch_labels, batch_entity_labels]
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.dense1 = nn.Linear(768, len(categories))
self.dense2 = nn.Linear(768, len(categories)+1) # +1 for the padding class
self.crf = CRF(len(categories))
def forward(self, inputs):
# stage-one output
token_ids, entity_ids = inputs[0], inputs[1]
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.dense1(last_hidden_state) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0)
# stage-two output
btz, entity_count, _ = entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
entity_states = torch.mean(entity_states, dim=2) # mean of the start/end hidden states
entity_logit = self.dense2(entity_states) # [btz, n_entities, n_entity_types]
return emission_score, attention_mask, entity_logit
def predict(self, token_ids):
self.eval()
with torch.no_grad():
# stage-one inference
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.dense1(last_hidden_state) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
# stage-two inference
batch_entity_ids = []
for one_samp in best_path:
entity_ids = []
for j, item in enumerate(one_samp):
if item.item() == 1: # B
entity_ids.append([j, j])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and (item.item() == 2): # I
entity_ids[-1][-1] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
if not entity_ids: # need at least one entity per sample
entity_ids.append([0, 0]) # pad with a dummy [0, 0] entity if there is none
batch_entity_ids.append([i for i in entity_ids if i])
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device) # [btz, n_entities, start/end]
btz, entity_count, _ = batch_entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
gather_index = batch_entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=gather_index).reshape(btz, entity_count, -1, hidden_size)
entity_states = torch.mean(entity_states, dim=2) # mean of the start/end hidden states
entity_logit = self.dense2(entity_states) # [btz, n_entities, n_entity_types]
entity_pred = torch.argmax(entity_logit, dim=-1) # [btz, n_entities]
# each element is a (sample_id, start, end, type) tuple
entity_tuple = trans_entity2tuple(batch_entity_ids, entity_pred)
return best_path, entity_tuple
model = Model().to(device)
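# A minimal standalone sketch (toy shapes, hypothetical values) of the
# torch.gather trick used in Model.forward/predict to pool entity states:
# the [btz, n_entities, 2] start/end indices are expanded over the hidden
# dimension, gathered from the sequence output, and averaged pairwise.
_btz, _seq_len, _hdsz = 2, 5, 4
_hidden = torch.randn(_btz, _seq_len, _hdsz)
_entity_ids = torch.tensor([[[1, 3]], [[0, 0]]])  # [btz, 1, start/end]
_index = _entity_ids.reshape(_btz, -1, 1).repeat(1, 1, _hdsz)
_states = torch.gather(_hidden, dim=1, index=_index).reshape(_btz, 1, 2, _hdsz)
assert torch.allclose(_states.mean(dim=2)[0, 0], (_hidden[0, 1] + _hidden[0, 3]) / 2)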
class Loss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.loss2 = nn.CrossEntropyLoss(ignore_index=0)
def forward(self, outputs, labels):
emission_score, attention_mask, entity_logit = outputs
seq_labels, entity_labels = labels
loss1 = model.crf(emission_score, attention_mask, seq_labels)
loss2 = self.loss2(entity_logit.reshape(-1, entity_logit.shape[-1]), entity_labels.flatten())
return {'loss': loss1+loss2, 'loss1': loss1, 'loss2': loss2}
# keys returned by Loss are added to metrics automatically, so loss1 and loss2 are printed even without listing them in metrics
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X1, Y1, Z1 = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for (token_ids, entity_ids), (label, entity_labels) in tqdm(data):
scores, entity_pred = model.predict(token_ids) # [btz, seq_len]
# stage-one metrics: token level
attention_mask = label.gt(0)
X1 += (scores.eq(label) * attention_mask).sum().item()
Y1 += scores.gt(0).sum().item()
Z1 += label.gt(0).sum().item()
# stage-two metrics: entity level
entity_true = trans_entity2tuple(entity_ids, entity_labels)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X1 / (Y1 + Z1), X1 / Y1, X1 / Z1
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(entity_ids, entity_labels):
'''Convert tensors to (sample_id, start, end, entity_type) tuples for metric computation
'''
entity_true = set()
for i, one_sample in enumerate(entity_ids):
for j, item in enumerate(one_sample):
if item[0].item() * item[1].item() != 0:
entity_true.add((i, item[0].item(), item[1].item(), entity_labels[i, j].item()))
return entity_true
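# Toy check (synthetic tensors, hypothetical values): the entity with start=1,
# end=2 and label id 2 becomes a (sample_id, start, end, label) tuple, while
# the all-zero padding entry is skipped.
_ids = torch.tensor([[[1, 2], [0, 0]]])
_lbls = torch.tensor([[2, 0]])
assert trans_entity2tuple(_ids, _lbls) == {(0, 1, 2, 2)}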
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-stage1] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-stage2] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# bert+crf for NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import time
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories)) # includes start/end
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
def acc(y_pred, y_true):
y_pred = y_pred[0]
y_pred = torch.argmax(y_pred, dim=-1)
acc = torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
return {'acc': acc}
# multiple custom metric formats are supported: ['accuracy', acc, {'acc': acc}] all work
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=acc)
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors to (sample_id, start, end, entity_type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
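# Toy check (synthetic label ids under `categories` above): the tag sequence
# O B-PER I-PER O B-LOC decodes to one PER span covering positions 1..2 and
# one single-token LOC span at position 4.
_toy_path = torch.tensor([[0, 3, 4, 0, 1]])
assert trans_entity2tuple(_toy_path) == {(0, 1, 2, 'PER'), (0, 4, 4, 'LOC')}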
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
logs["f1"] = f2
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
# time_fw is the file object for timing logs, written to 'log/time.txt'
time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
# record the program start time
time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
# record the program end time
time_fw.write('End Time: {:.6f}\n'.format(time.time()))
time_fw.flush()
time_fw.close()
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# bert+crf for NER, with part-of-speech tags as an extra embedding
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.30; entity_level: 96.09
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import jieba.posseg as psg
from collections import Counter
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
psg_map = {v: i+1 for i, v in enumerate(['a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 'e', 'f', 'g', 'h', 'i', 'j',
'k', 'l', 'm', 'mg', 'mq', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 'rg', 'rr', 'rz', 's', 't',
'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq', 'x', 'y', 'z', 'zg'])}
def collate_fn(batch):
batch_token_ids, batch_psg_ids, batch_labels = [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens) # span of the i-th token in the original text
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
# build the part-of-speech inputs
seg = [(i, p) for word, p in psg.cut(d[0]) for i in word]
seg_word, seg_p = zip(*seg)
psg_ids = np.zeros(len(token_ids))
for i, j in enumerate(mapping):
if j:
start, end = j[0], j[-1] # first/last char positions of the token in the original text
token_new = (''.join(seg_word[start:end+1])).lower()
assert tokens[i] == token_new, f"{tokens[i]} -> {token_new}"
if start == end:
psg_ids[i] = psg_map.get(seg_p[start], 0) # 0 if the tag is not in the map
else:
psg_ids[i] = psg_map.get(Counter(seg_p[start:end+1]).most_common(1)[0][0], 0) # take the most frequent tag
batch_psg_ids.append(psg_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_psg_ids = torch.tensor(sequence_padding(batch_psg_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return [batch_token_ids, batch_psg_ids], batch_labels
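# Small illustration (hypothetical sentence; the exact output depends on the
# jieba dictionary) of the word-to-char POS broadcast above: jieba.posseg tags
# whole words, and every character inherits its word's tag before being aligned
# to BERT tokens via `mapping`.
_demo = [(ch, p) for word, p in psg.cut('北京欢迎你') for ch in word]
# expected roughly: [('北', 'ns'), ('京', 'ns'), ('欢', 'v'), ('迎', 'v'), ('你', 'r')]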
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
layer_add_embs = nn.Embedding(len(psg_map)+1, 768)
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0,
layer_add_embs=layer_add_embs)
self.fc = nn.Linear(768, len(categories))
self.crf = CRF(len(categories))
def forward(self, token_ids, psg_ids):
sequence_output = self.bert([token_ids, psg_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0)
return emission_score, attention_mask
def predict(self, token_ids, psg_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids, psg_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for (token_ids, psg_ids), label in tqdm(data):
scores = model.predict(token_ids, psg_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors to (sample_id, start, end, entity_type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# bert+crf for NER, testing two schemes for data-derived CRF transitions: use them as a trainable initialization, or freeze them
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# trainable init: [valid_f1] token_level: 97.35; entity_level: 96.42
# frozen: [valid_f1] token_level: 96.92; entity_level: 95.42
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# build the dataloaders
train_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')
valid_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev')
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)
# generate transition weights from the training data
transition = np.zeros((len(categories), len(categories)))
start_transition = np.zeros(len(categories))
end_transition = np.zeros(len(categories))
for d in tqdm(train_data, desc='Generate init_transitions'):
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
for i in range(len(labels)-1):
transition[int(labels[i]), int(labels[i+1])] += 1
start_transition[int(labels[0])] += 1 # start -> first tag
end_transition[int(labels[-1])] += 1 # last tag -> end
max_v = np.max([np.max(transition), np.max(start_transition), np.max(end_transition)])
min_v = np.min([np.min(transition), np.min(start_transition), np.min(end_transition)])
transition = (transition - min_v) / (max_v - min_v)
start_transition = (start_transition - min_v) / (max_v - min_v)
end_transition = (end_transition - min_v) / (max_v - min_v)
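# Sanity sketch (toy label sequence, not from the corpus) of the counting
# scheme above: every observed tag bigram adds transition mass, so the ids
# for O B-LOC I-LOC O contribute to O->B-LOC, B-LOC->I-LOC and I-LOC->O.
_toy_labels = [0, 1, 2, 0]
_toy_trans = np.zeros((len(categories), len(categories)))
for _a, _b in zip(_toy_labels, _toy_labels[1:]):
    _toy_trans[_a, _b] += 1
assert _toy_trans[0, 1] == _toy_trans[1, 2] == _toy_trans[2, 0] == 1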
# define the model on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories))
self.crf = CRF(len(categories), init_transitions=[transition, start_transition, end_transition], freeze=True) # controls whether transitions are initialized and whether they stay frozen during training
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors to (sample_id, start, end, entity_type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# bert+crf for NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import time
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = '/workspace/bert-base-chinese/config.json'
checkpoint_path = '/workspace/bert-base-chinese/pytorch_model.bin'
dict_path = '/workspace/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# build the dataloaders
train_dataloader = DataLoader(MyDataset('/workspace/bert-base-chinese/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/workspace/bert-base-chinese/china-people-daily-ner-corpus/example.test'), batch_size=batch_size, collate_fn=collate_fn)
# define the model on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories)) # includes start/end
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
def acc(y_pred, y_true):
y_pred = y_pred[0]
y_pred = torch.argmax(y_pred, dim=-1)
acc = torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
return {'acc': acc}
# multiple custom metric formats are supported: ['accuracy', acc, {'acc': acc}] all work
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=acc)
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token level
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity level
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors to (sample_id, start, end, entity_type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
model.save_weights('best_model.pt')
logs["f1"] = f2
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
# time_fw is the file object for timing logs, written to 'log/time.txt'
time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
# record the program start time
time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
#evaluator = Evaluator()
#model.fit(train_dataloader, epochs=7, steps_per_epoch=None, callbacks=[evaluator])
# load the best weights and run evaluation only
model.load_weights("best_model.pt")
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
print(f1, precision, recall, f2, precision2, recall2)
time_fw.write('End Time: {:.6f}\n'.format(time.time()))
time_fw.flush()
time_fw.close()
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# efficient_global_pointer for NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# blog: https://kexue.fm/archives/8373
# [valid_f1]: 96.55
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import EfficientGlobalPointer
maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
data.append((text, label)) # label is a list of [start, end, entity_type]
return data
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for i, (text, text_labels) in enumerate(batch):
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros((len(categories_label2id), maxlen, maxlen))
for start, end, label in text_labels:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label = categories_label2id[label]
labels[label, start, end] = 1
batch_token_ids.append(token_ids) # length already capped by maxlen above
batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
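# Toy illustration (synthetic shapes): an entity spanning tokens 2..4 with
# class 'PER' lights up exactly one cell of the [n_heads, seq_len, seq_len]
# grid, at [categories_label2id['PER'], start, end].
_demo_grid = np.zeros((ner_vocab_size, 8, 8))
_demo_grid[categories_label2id['PER'], 2, 4] = 1
assert list(zip(*np.where(_demo_grid > 0))) == [(2, 2, 4)]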
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.global_pointer = EfficientGlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
return logit
model = Model().to(device)
class MyLoss(MultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_pred, y_true):
y_true = y_true.view(y_true.shape[0]*y_true.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
y_pred = y_pred.view(y_pred.shape[0]*y_pred.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
return super().forward(y_pred, y_true)
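# For reference, a minimal sketch of the multilabel categorical crossentropy
# (following the formulation in https://kexue.fm/archives/7359; the bert4torch
# class is assumed to be equivalent): positive class scores are pushed above 0
# and negative ones below 0 via two logsumexp terms against an implicit
# threshold class with score 0.
def _multilabel_categorical_ce(y_pred, y_true):
    y_pred = (1 - 2 * y_true) * y_pred  # flip the sign of positive-class scores
    y_pred_neg = y_pred - y_true * 1e12  # mask positives out of the negative term
    y_pred_pos = y_pred - (1 - y_true) * 1e12  # mask negatives out of the positive term
    zeros = torch.zeros_like(y_pred[..., :1])  # the implicit s_0 = 0 column
    neg_loss = torch.logsumexp(torch.cat([y_pred_neg, zeros], dim=-1), dim=-1)
    pos_loss = torch.logsumexp(torch.cat([y_pred_pos, zeros], dim=-1), dim=-1)
    return (neg_loss + pos_loss).mean()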
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):  # GlobalPointer logits are thresholded at 0
X, Y, Z = 1e-10, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true)
for i, score in enumerate(scores):
R = set()
for l, start, end in zip(*np.where(score.cpu() > threshold)):
R.add((start, end, categories_id2label[l]))
T = set()
for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
T.add((start, end, categories_id2label[l]))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# global_pointer for NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# blog: https://kexue.fm/archives/8373
# [valid_f1]: 95.66
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import GlobalPointer
import random
import os
maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
data.append((text, label)) # label is a list of [start, end, entity_type]
return data
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for i, (text, text_labels) in enumerate(batch):
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros((len(categories_label2id), maxlen, maxlen))
for start, end, label in text_labels:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label = categories_label2id[label]
labels[label, start, end] = 1
batch_token_ids.append(token_ids) # length already capped by maxlen above
batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model on top of bert
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.global_pointer = GlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
return logit
model = Model().to(device)
class MyLoss(MultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_pred, y_true):
y_true = y_true.view(y_true.shape[0]*y_true.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
y_pred = y_pred.view(y_pred.shape[0]*y_pred.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
return super().forward(y_pred, y_true)
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
X, Y, Z = 0, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true)
for i, score in enumerate(scores):
R = set()
for l, start, end in zip(*np.where(score.cpu() > threshold)):
R.add((start, end, categories_id2label[l]))
T = set()
for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
T.add((start, end, categories_id2label[l]))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')