Commit 66a1d0d0 authored by yangzhong

Initial version of the bert4torch project
#! -*- coding: utf-8 -*-
# Pre-training corpus construction; only the MLM task is implemented here, NSP and SOP are not used.
# Approach: keep generating data files dynamically and sleep once the maximum number of saved files is reached;
# if training consumes data faster than it is generated, several data-generation scripts can be run in parallel.
import numpy as np
from bert4torch.tokenizers import Tokenizer
import json, glob, re
from tqdm import tqdm
import collections
import gc
import shelve
import time
import os
import random
import jieba
jieba.initialize()
class TrainingDataset(object):
"""预训练数据集生成器
"""
def __init__(self, tokenizer, sequence_length=512):
"""参数说明:
tokenizer must be the Tokenizer class bundled with bert4torch;
"""
self.tokenizer = tokenizer
self.sequence_length = sequence_length
self.token_pad_id = tokenizer._token_pad_id
self.token_cls_id = tokenizer._token_start_id
self.token_sep_id = tokenizer._token_end_id
self.token_mask_id = tokenizer._token_mask_id
self.vocab_size = tokenizer._vocab_size
def padding(self, sequence, padding_value=None):
"""对单个序列进行补0
"""
if padding_value is None:
padding_value = self.token_pad_id
sequence = sequence[:self.sequence_length]
padding_length = self.sequence_length - len(sequence)
return sequence + [padding_value] * padding_length
def sentence_process(self, text):
"""单个文本的处理函数,返回处理后的instance
"""
raise NotImplementedError
def paragraph_process(self, texts, starts, ends, paddings):
"""单个段落(多个文本)的处理函数
说明:texts是单句组成的list;starts是每个instance的起始id;
ends是每个instance的终止id;paddings是每个instance的填充id。
做法:不断塞句子,直到长度最接近sequence_length,然后padding。
"""
instances, instance = [], [[start] for start in starts]
for text in texts:
# 处理单个句子
sub_instance = self.sentence_process(text)
sub_instance = [i[:self.sequence_length - 2] for i in sub_instance]
new_length = len(instance[0]) + len(sub_instance[0])
# 如果长度即将溢出
if new_length > self.sequence_length - 1:
# 插入终止符,并padding
complete_instance = []
for item, end, pad in zip(instance, ends, paddings):
item.append(end)
item = self.padding(item, pad)
complete_instance.append(item)
# 存储结果,并构建新样本
instances.append(complete_instance)
instance = [[start] for start in starts]
# 样本续接
for item, sub_item in zip(instance, sub_instance):
item.extend(sub_item)
# 插入终止符,并padding
complete_instance = []
for item, end, pad in zip(instance, ends, paddings):
item.append(end)
item = self.padding(item, pad)
complete_instance.append(item)
# 存储最后的instance
instances.append(complete_instance)
return instances
def serialize(self, instances, db, count):
"""写入到文件
"""
for instance in instances:
input_ids, masked_lm_labels = instance[0], instance[1]
assert len(input_ids) <= self.sequence_length
features = collections.OrderedDict()
features["input_ids"] = input_ids
features["masked_lm_labels"] = masked_lm_labels
db[str(count)] = features
count += 1
return count
def process(self, corpus, record_name):
"""处理输入语料(corpus)
"""
count = 0
db = shelve.open(record_name)
for texts in corpus:
instances = self.paragraph_process(texts)
count = self.serialize(instances, db, count)
db.close()
del instances
gc.collect()
# 记录对应的文件名和样本量
record_info = {"filename": record_name, "samples_num": count}
json.dump(record_info, open(record_name + ".json", "w", encoding="utf-8"))
print('write %s examples into %s' % (count, record_name))
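# Illustrative sketch only (not called anywhere in this script): one way the shelve records
# written by TrainingDataset.process() could be read back on the training side. The helper
# name `load_shelve_records` and the yielded tuple are assumptions for demonstration; the
# real training loop may consume the records differently.
def load_shelve_records(record_name):
    """Yield (input_ids, masked_lm_labels) pairs from a shelve file written by process()."""
    with open(record_name + ".json", encoding="utf-8") as f:
        info = json.load(f)  # {"filename": ..., "samples_num": ...}
    db = shelve.open(record_name)
    try:
        for i in range(info["samples_num"]):
            features = db[str(i)]
            yield features["input_ids"], features["masked_lm_labels"]
    finally:
        db.close()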
class TrainingDatasetRoBERTa(TrainingDataset):
"""预训练数据集生成器(RoBERTa模式)
"""
def __init__(self, tokenizer, word_segment, mask_rate=0.15, sequence_length=512):
"""参数说明:
tokenizer必须是bert4torch自带的tokenizer类;
word_segment是任意分词函数。
"""
super(TrainingDatasetRoBERTa, self).__init__(tokenizer, sequence_length)
self.word_segment = word_segment
self.mask_rate = mask_rate
def token_process(self, token_id):
"""以80%的几率替换为[MASK],以10%的几率保持不变,
以10%的几率替换为一个随机token。
"""
rand = np.random.random()
if rand <= 0.8:
return self.token_mask_id
elif rand <= 0.9:
return token_id
else:
return np.random.randint(0, self.vocab_size)
def sentence_process(self, text):
"""单个文本的处理函数
流程:分词,然后转id,按照mask_rate构建全词mask的序列, 来指定哪些token是否要被mask
"""
words = self.word_segment(text)
rands = np.random.random(len(words))
token_ids, mask_ids = [], []
for rand, word in zip(rands, words):
word_tokens = self.tokenizer.tokenize(text=word)[1:-1]
word_token_ids = self.tokenizer.tokens_to_ids(word_tokens)
if rand < self.mask_rate:
word_mask_ids = [self.token_process(i) for i in word_token_ids]
token_ids.extend(word_mask_ids)
mask_ids.extend(word_token_ids)
else:
token_ids.extend(word_token_ids)
word_mask_ids = [0] * len(word_tokens)
mask_ids.extend(word_mask_ids)
return [token_ids, mask_ids]
def paragraph_process(self, texts):
"""给原方法补上starts、ends、paddings
"""
starts = [self.token_cls_id, 0]
ends = [self.token_sep_id, 0]
paddings = [self.token_pad_id, 0]
return super(TrainingDatasetRoBERTa, self).paragraph_process(texts, starts, ends, paddings)
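# Illustrative sketch only (never called during data generation): the 80%/10%/10%
# replacement policy implemented in TrainingDatasetRoBERTa.token_process, written as a
# standalone function so the three branches are easy to eyeball. `mask_id` and
# `vocab_size` defaults are placeholder assumptions.
def _demo_token_process(token_id, mask_id=103, vocab_size=21128):
    rand = np.random.random()
    if rand <= 0.8:            # 80%: replace with [MASK]
        return mask_id
    elif rand <= 0.9:          # 10%: keep the original token
        return token_id
    else:                      # 10%: replace with a random token
        return np.random.randint(0, vocab_size)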
if __name__ == '__main__':
sequence_length = 512 # 文本长度
max_file_num = 40 # 最大保存的文件个数
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt' # 字典文件
dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain' # 保存的文件目录
dir_corpus = 'F:/Projects/data/corpus/pretrain' # 读入的语料地址
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def some_texts():
'''挑选语料
'''
files_corpus = glob.glob(f'{dir_corpus}/*/*') # 根据目录结构自行调整
file_corpus = random.choice(files_corpus) # 随机挑选一篇文章
count, texts = 0, []
with open(file_corpus, encoding='utf-8') as f:
for l in tqdm(f, desc=f'Load data from {file_corpus}'):
l = l.strip()
texts.extend(re.findall(u'.*?[\n。]+', l))
count += 1
if count == 10: # 10篇文章合在一起再处理
yield texts
count, texts = 0, []
if texts:
yield texts
def word_segment(text):
return jieba.lcut(text)
TD = TrainingDatasetRoBERTa(tokenizer, word_segment, sequence_length=sequence_length)
while True:
train_files = [file for file in os.listdir(dir_training_data) if ('train_' in file) and ('dat' in file)]
# 当保存的训练文件未达到指定数量时
if len(train_files) < max_file_num:
record_name = f'{dir_training_data}/train_'+ time.strftime('%Y%m%d%H%M%S', time.localtime())
TD.process(corpus=some_texts(), record_name=record_name)
time.sleep(1) # 可不加,这里是防止生成文件名一样
else:
time.sleep(300)
#! -*- coding: utf-8 -*-
# SimBERT_v2 pre-training, stage 1: trained in the same way as SimBERT, plus a [MASK] prediction task.
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()
# 基本信息
maxlen = 64
batch_size = 12
# bert配置,加载roformer权重
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# The corpus here is less rich than the official one; a custom corpus can be substituted.
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取语料,每行一个json
示例:{"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
D.append(json.loads(l))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def masked_encode(text):
"""wwm随机mask
"""
words = jieba.lcut(text)
rands = np.random.random(len(words))
source, target = [tokenizer._token_start_id], [0]
for r, w in zip(rands, words):
ids = tokenizer.encode(w)[0][1:-1]
if r < 0.15 * 0.8:
source.extend([tokenizer._token_mask_id] * len(ids))
target.extend(ids)
elif r < 0.15 * 0.9:
source.extend(ids)
target.extend(ids)
elif r < 0.15:
source.extend(np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1)
target.extend(ids)
else:
source.extend(ids)
target.extend([0] * len(ids))
source = source[:maxlen - 1] + [tokenizer._token_end_id]
target = target[:maxlen - 1] + [0]
return source, target
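# Illustrative sketch only (never called by the training code): a small sanity check for
# masked_encode(), showing that `source` and `target` stay aligned and that `target` is
# non-zero exactly on the positions chosen for MLM prediction. The sample sentence is an
# arbitrary placeholder.
def _demo_masked_encode(text='今天天气不错'):
    source, target = masked_encode(text)
    assert len(source) == len(target)
    predicted_positions = [i for i, t in enumerate(target) if t != 0]
    return source, target, predicted_positions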
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
for d in batch:
text, synonyms = d['text'], d['synonyms']
synonyms = [text] + synonyms
np.random.shuffle(synonyms)
for _ in range(2):
text, synonym = synonyms[:2]
if np.random.random() < 0.5:
text_ids = masked_encode(text)[0]
else:
text_ids = tokenizer.encode(text)[0]
synonym_ids = tokenizer.encode(synonym)[0][1:]
truncate_sequences(maxlen * 2, -2, text_ids, synonym_ids)
token_ids = text_ids + synonym_ids
segment_ids = [0] * len(text_ids) + [1] * len(synonym_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
text, synonym = synonym, text
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 建立加载模型
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
with_pool='linear', with_mlm=True, dropout_rate=0.2, application='unilm')
self.pool_method = pool_method
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return seq_logit, sen_emb
model = Model(pool_method='cls').to(device)
class TotalLoss(nn.Module):
"""loss分两部分,一是seq2seq的交叉熵,二是相似度的交叉熵。
"""
def forward(self, outputs, target):
seq_logit, sen_emb = outputs
seq_label, seq_mask = target
seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
similarity_loss = self.compute_loss_of_similarity(sen_emb)
return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}
def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
'''
y_pred: [btz, seq_len, hdsz]
y_true: [btz, seq_len]
y_mask: [btz, seq_len]
'''
y_true = y_true[:, 1:] # 目标token_ids
y_mask = y_mask[:, 1:] # 指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return F.cross_entropy(y_pred, y_true, ignore_index=0)
def compute_loss_of_similarity(self, y_pred):
y_true = self.get_labels_of_similarity(y_pred) # 构建标签
y_pred = F.normalize(y_pred, p=2, dim=-1) # 句向量归一化
similarities = torch.matmul(y_pred, y_pred.T) # 相似度矩阵
similarities = similarities - torch.eye(y_pred.shape[0], device=device) * 1e12 # 排除对角线
similarities = similarities * 30 # scale
loss = F.cross_entropy(similarities, y_true)
return loss
def get_labels_of_similarity(self, y_pred):
idxs = torch.arange(0, y_pred.shape[0], device=device)
idxs_1 = idxs[None, :]
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
labels = idxs_1.eq(idxs_2).float()
return labels
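# Illustrative sketch only: the label construction in get_labels_of_similarity pairs
# adjacent rows of the batch (0 with 1, 2 with 3, ...), because collate_fn appends a text
# and its synonym back to back. This helper just materializes that mapping for a given
# batch size and is not used by the training loop.
def _demo_similarity_labels(batch_size=4):
    idxs = torch.arange(0, batch_size)
    partner = idxs + 1 - idxs % 2 * 2      # 0->1, 1->0, 2->3, 3->2, ...
    return partner.tolist()                # e.g. [1, 0, 3, 2] for batch_size=4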
model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps('logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
seq_logit, _ = model.predict([token_ids, segment_ids])
return seq_logit[:, -1, :]
def generate(self, text, n=1, topk=5):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.random_sample([token_ids, segment_ids], n, topk) # 基于随机采样
return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]
synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def cal_sen_emb(text_list):
'''输入text的list,计算sentence的embedding
'''
X, S = [], []
for t in text_list:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
_, Z = model.predict([X, S])
return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
r = [i for i in set(r) if i != text] # 不和原文相同
r = [text] + r
Z = cal_sen_emb(r)
Z /= (Z**2).sum(dim=1, keepdims=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
def just_show(some_samples):
"""随机观察一些样本的效果
"""
S = [np.random.choice(some_samples) for _ in range(3)]
for s in S:
try:
print(u'原句子:%s' % s)
print(u'同义句子:', gen_synonyms(s, 10, 10))
print()
except:
pass
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show(['微信和支付宝拿个好用?',
'微信和支付宝,哪个好?',
'微信和支付宝哪个好',
'支付宝和微信哪个好',
'支付宝和微信哪个好啊',
'微信和支付宝那个好用?',
'微信和支付宝哪个好用',
'支付宝和微信那个更好',
'支付宝和微信哪个好用',
'微信和支付宝用起来哪个好?',
'微信和支付宝选哪个好'
])
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# SimBERT_v2 pre-training, stage 2: distill SimBERT's similarity scores into roformer-sim.
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, get_pool_emb
from bert4torch.snippets import AutoRegressiveDecoder, Callback, truncate_sequences
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()
# 基本信息
maxlen = 64
batch_size = 12
# bert配置,需要加载stage1训练后的权重,这里直接加载官方最终的权重以示例
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 这里语料和stage1保持一致
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取语料,每行一个json
示例:{"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
D.append(json.loads(l))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def masked_encode(text):
"""wwm随机mask
"""
words = jieba.lcut(text)
rands = np.random.random(len(words))
source, target = [tokenizer._token_start_id], [0]
for r, w in zip(rands, words):
ids = tokenizer.encode(w)[0][1:-1]
if r < 0.15 * 0.8:
source.extend([tokenizer._token_mask_id] * len(ids))
target.extend(ids)
elif r < 0.15 * 0.9:
source.extend(ids)
target.extend(ids)
elif r < 0.15:
source.extend(
np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1
)
target.extend(ids)
else:
source.extend(ids)
target.extend([0] * len(ids))
source = source[:maxlen - 1] + [tokenizer._token_end_id]
target = target[:maxlen - 1] + [0]
return source, target
# ========== 蒸馏用:开始 ==========
# simbert配置
sim_config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
sim_checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
sim_dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'
# 建立分词器
sim_tokenizer = Tokenizer(sim_dict_path, do_lower_case=True) # 建立分词器
# 建立加载模型
simbert = build_transformer_model(sim_config_path, sim_checkpoint_path, with_pool='linear', application='unilm').to(device)
# ========== 蒸馏用:结束 ==========
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
batch_sim_token_ids, batch_sim_segment_ids = [], []
for d in batch:
text, synonyms = d['text'], d['synonyms']
synonyms = [text] + synonyms
np.random.shuffle(synonyms)
for _ in range(2):
text, synonym = synonyms[:2]
if np.random.random() < 0.5:
text_ids = masked_encode(text)[0]
else:
text_ids = tokenizer.encode(text)[0]
synonym_ids = tokenizer.encode(synonym)[0][1:]
truncate_sequences(maxlen * 2, -2, text_ids, synonym_ids)
token_ids = text_ids + synonym_ids
segment_ids = [0] * len(text_ids) + [1] * len(synonym_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
# ==== 蒸馏用:开始 ====
token_ids, segment_ids = sim_tokenizer.encode(text, maxlen=maxlen)
batch_sim_token_ids.append(token_ids)
batch_sim_segment_ids.append(segment_ids)
# ==== 蒸馏用:结束 ====
text, synonym = synonym, text
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
# ==== 蒸馏用:开始 ====
batch_sim_token_ids = torch.tensor(sequence_padding(batch_sim_token_ids), dtype=torch.long, device=device)
batch_sim_segment_ids = torch.tensor(sequence_padding(batch_sim_segment_ids), dtype=torch.long, device=device)
sim_vecs = simbert.predict([batch_sim_token_ids, batch_sim_segment_ids])[1]
sim_vecs /= (sim_vecs**2).sum(dim=-1, keepdims=True)**0.5
sims = torch.matmul(sim_vecs, sim_vecs.T)
# ==== 蒸馏用:结束 ====
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids, sims]
train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 建立加载模型
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
with_pool='linear', with_mlm=True, dropout_rate=0.2, application='unilm')
self.pool_method = pool_method
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return seq_logit, sen_emb
model = Model(pool_method='cls').to(device)
class TotalLoss(nn.Module):
"""loss分两部分,一是seq2seq的交叉熵,二是相似度的交叉熵。
"""
def forward(self, outputs, target):
seq_logit, sen_emb = outputs
seq_label, seq_mask, sims = target
seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
similarity_loss = self.compute_loss_of_similarity(sen_emb, sims)
return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}
def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
'''
y_pred: [btz, seq_len, hdsz]
y_true: [btz, seq_len]
y_mask: [btz, seq_len]
'''
y_true = y_true[:, 1:] # 目标token_ids
y_mask = y_mask[:, 1:] # 指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return F.cross_entropy(y_pred, y_true, ignore_index=0)
def compute_loss_of_similarity(self, y_pred, y_true):
y_pred = F.normalize(y_pred, p=2, dim=-1) # 句向量归一化
similarities = torch.matmul(y_pred, y_pred.T) # 相似度矩阵
loss = 100 * torch.mean((similarities - y_true) ** 2)
return loss
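# Illustrative sketch only: a minimal standalone version of the distillation objective in
# compute_loss_of_similarity above. The student's normalized sentence-embedding similarity
# matrix is pulled towards the teacher (SimBERT) similarity matrix `sims` computed in
# collate_fn, via a scaled MSE. Tensor shapes in the comments are assumptions.
def _demo_distill_loss(student_emb, teacher_sims):
    student_emb = F.normalize(student_emb, p=2, dim=-1)      # [btz, hdsz]
    student_sims = torch.matmul(student_emb, student_emb.T)  # [btz, btz]
    return 100 * torch.mean((student_sims - teacher_sims) ** 2)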
model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps('logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
seq_logit, _ = model.predict([token_ids, segment_ids])
return seq_logit[:, -1, :]
def generate(self, text, n=1, topk=5):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.random_sample([token_ids, segment_ids], n, topk) # 基于随机采样
return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]
synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def cal_sen_emb(text_list):
'''输入text的list,计算sentence的embedding
'''
X, S = [], []
for t in text_list:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
_, Z = model.predict([X, S])
return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
r = [i for i in set(r) if i != text] # 不和原文相同
r = [text] + r
Z = cal_sen_emb(r)
Z /= (Z**2).sum(dim=1, keepdims=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
def just_show(some_samples):
"""随机观察一些样本的效果
"""
S = [np.random.choice(some_samples) for _ in range(3)]
for s in S:
try:
print(u'原句子:%s' % s)
print(u'同义句子:', gen_synonyms(s, 10, 10))
print()
except:
pass
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show(['微信和支付宝拿个好用?',
'微信和支付宝,哪个好?',
'微信和支付宝哪个好',
'支付宝和微信哪个好',
'支付宝和微信哪个好啊',
'微信和支付宝那个好用?',
'微信和支付宝哪个好用',
'支付宝和微信那个更好',
'支付宝和微信哪个好用',
'微信和支付宝用起来哪个好?',
'微信和支付宝选哪个好'
])
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# SimBERT_v2 supervised training stage.
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import json
import glob
# 基本信息
maxlen = 64
batch_size = 12
labels = ['contradiction', 'entailment', 'neutral']
# bert配置,需要加载stage2训练后的权重,这里直接加载官方最终的权重以示例
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def split(text):
"""分割句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen * 1.2, seps, strips)
class MyDataset(ListDataset):
def load_data(self, file_path):
dataset1_path, dataset2_path = file_path
D1 = self.load_data_1(dataset1_path)
D2 = self.load_data_2(dataset2_path)
return D1 + D2
@staticmethod
def load_data_1(filenames, threshold=0.5):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) != 3:
continue
l[0], l[1] = split(l[0])[0], split(l[1])[0]
D.append((l[0], l[1], int(float(l[2]) > threshold)))
return D
@staticmethod
def load_data_2(dir_path):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in glob.glob(dir_path):
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['gold_label'] not in labels:
continue
text1 = split(l['sentence1'])[0]
text2 = split(l['sentence2'])[0]
label = labels.index(l['gold_label']) + 2
D.append((text1, text2, label))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text1, text2, label in batch:
for text in [text1, text2]:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels
# 加载数据集
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
dataset1_path = []
for task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']:
for f in ['train', 'valid']:
threshold = 2.5 if task_name == 'STS-B' else 0.5
filename = '%s%s/%s.%s.data' % (data_path, task_name, task_name, f)
dataset1_path.append(filename)
dataset2_path = 'F:/Projects/data/corpus/sentence_embedding/XNLI-MT-1.0/cnsd/cnsd-*/*.jsonl'
train_dataloader = DataLoader(MyDataset([dataset1_path, dataset2_path]), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 建立加载模型
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
with_pool='linear', dropout_rate=0.2)
self.pool_method = pool_method
self.dense = nn.Linear(768*3, 5, bias=False)
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method) # [btz*2, hdsz]
# 向量合并:a、b、|a-b|拼接
u, v = sen_emb[::2], sen_emb[1::2]
sen_emb_concat = torch.cat([u, v, torch.abs(u-v)], dim=-1) # [btz, hdsz*3]
y_pred = self.dense(sen_emb_concat) # [btz, 5]
return y_pred
model = Model(pool_method='cls').to(device)
class MyLoss(nn.Module):
"""loss分
"""
def __init__(self) -> None:
super().__init__()
self.mask = torch.tensor([0,0,1,1,1], device=device)
def forward(self, y_pred, y_true):
'''如果是两分类数据,则把后三位置-inf,如果是三分类数据,把前两位置-inf
'''
task = (y_true < 1.5).long()
y_pred_1 = y_pred - self.mask * 1e12
y_pred_2 = y_pred - (1-self.mask) * 1e12
y_pred = task * y_pred_1 + (1-task) * y_pred_2
return F.cross_entropy(y_pred, y_true.flatten())
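# Illustrative sketch only: a toy example of the logit-masking trick in MyLoss.forward.
# For a binary-labelled sample (label 0/1) the last three logits are pushed towards -inf,
# and for an NLI sample (label 2/3/4) the first two logits are pushed towards -inf, so one
# 5-way softmax serves both tasks. The logits below are made-up numbers.
def _demo_task_masking():
    mask = torch.tensor([0, 0, 1, 1, 1.])
    logits = torch.tensor([[1.0, 2.0, 0.5, 0.1, 0.3]])
    y_true = torch.tensor([1])                      # binary sample -> task == 1
    task = (y_true < 1.5).long().unsqueeze(-1)
    masked = task * (logits - mask * 1e12) + (1 - task) * (logits - (1 - mask) * 1e12)
    return masked                                   # last three entries are ~-1e12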
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5))
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task based on the "half-pointer, half-tagging" structure.
# Write-up: https://kexue.fm/archives/7161
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
import numpy as np
from bert4torch.layers import LayerNorm
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn as nn
maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载标签字典
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def collate_fn(batch):
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
batch_token_ids, batch_segment_ids = [], []
batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
for d in batch:
token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
# 整理三元组 {s: [(o, p)]}
spoes = {}
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
p = predicate2id[p]
o = tokenizer.encode(o)[0][1:-1]
s_idx = search(s, token_ids)
o_idx = search(o, token_ids)
if s_idx != -1 and o_idx != -1:
s = (s_idx, s_idx + len(s) - 1)
o = (o_idx, o_idx + len(o) - 1, p)
if s not in spoes:
spoes[s] = []
spoes[s].append(o)
if spoes:
# subject标签
subject_labels = np.zeros((len(token_ids), 2))
for s in spoes:
subject_labels[s[0], 0] = 1 # subject首
subject_labels[s[1], 1] = 1 # subject尾
# Randomly pick one subject (this is not an implementation error; it is exactly the intended behaviour!)
# Todo: a mask could be added for the subjects that were not picked so they are excluded from the loss;
# the impact is probably small because the model weights positive examples via the prob**n trick
start, end = np.array(list(spoes.keys())).T
start = np.random.choice(start)
end = np.random.choice(end[end >= start])
subject_ids = (start, end)
# 对应的object标签
object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
for o in spoes.get(subject_ids, []):
object_labels[o[0], o[2], 0] = 1
object_labels[o[1], o[2], 1] = 1
# 构建batch
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_subject_labels.append(subject_labels)
batch_subject_ids.append(subject_ids)
batch_object_labels.append(object_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_subject_labels = torch.tensor(sequence_padding(batch_subject_labels), dtype=torch.float, device=device)
batch_subject_ids = torch.tensor(batch_subject_ids, dtype=torch.long, device=device)
batch_object_labels = torch.tensor(sequence_padding(batch_object_labels), dtype=torch.float, device=device)
batch_attention_mask = (batch_token_ids != tokenizer._token_pad_id)
return [batch_token_ids, batch_segment_ids, batch_subject_ids], [batch_subject_labels, batch_object_labels, batch_attention_mask]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path)
self.linear1 = nn.Linear(768, 2)
self.condLayerNorm = LayerNorm(hidden_size=768, conditional_size=768*2)
self.linear2 = nn.Linear(768, len(predicate2id)*2)
@staticmethod
def extract_subject(inputs):
"""根据subject_ids从output中取出subject的向量表征
"""
output, subject_ids = inputs
start = torch.gather(output, dim=1, index=subject_ids[:, :1].unsqueeze(2).expand(-1, -1, output.shape[-1]))
end = torch.gather(output, dim=1, index=subject_ids[:, 1:].unsqueeze(2).expand(-1, -1, output.shape[-1]))
subject = torch.cat([start, end], 2)
return subject[:, 0]
def forward(self, inputs):
# 预测subject
seq_output = self.bert(inputs[:2]) # [btz, seq_len, hdsz]
subject_preds = (torch.sigmoid(self.linear1(seq_output)))**2 # [btz, seq_len, 2]
# 传入subject,预测object
# 通过Conditional Layer Normalization将subject融入到object的预测中
subject_ids = inputs[2]
# In theory the hidden states before LayerNorm should be used, but here only each block's top-level output can be returned, so this differs from the keras implementation
subject = self.extract_subject([seq_output, subject_ids])
output = self.condLayerNorm([seq_output, subject])
output = (torch.sigmoid(self.linear2(output)))**4
object_preds = output.reshape(*output.shape[:2], len(predicate2id), 2)
return [subject_preds, object_preds]
def predict_subject(self, inputs):
self.eval()
with torch.no_grad():
seq_output = self.bert(inputs[:2]) # [btz, seq_len, hdsz]
subject_preds = (torch.sigmoid(self.linear1(seq_output)))**2 # [btz, seq_len, 2]
return [seq_output, subject_preds]
def predict_object(self, inputs):
self.eval()
with torch.no_grad():
seq_output, subject_ids = inputs
subject = self.extract_subject([seq_output, subject_ids])
output = self.condLayerNorm([seq_output, subject])
output = (torch.sigmoid(self.linear2(output)))**4
object_preds = output.reshape(*output.shape[:2], len(predicate2id), 2)
return object_preds
train_model = Model().to(device)
class BCELoss(nn.BCELoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, inputs, targets):
subject_preds, object_preds = inputs
subject_labels, object_labels, mask = targets
# subject loss
subject_loss = super().forward(subject_preds, subject_labels)
subject_loss = subject_loss.mean(dim=-1)
subject_loss = (subject_loss * mask).sum() / mask.sum()
# object部分loss
object_loss = super().forward(object_preds, object_labels)
object_loss = object_loss.mean(dim=-1).sum(dim=-1)
object_loss = (object_loss * mask).sum() / mask.sum()
return subject_loss + object_loss
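# Illustrative sketch only: a toy-shaped walk-through of the reduction performed in
# BCELoss.forward above. With reduction='none' the parent class returns element-wise
# losses; the subject branch averages over the 2 start/end channels, the object branch
# averages over start/end and sums over predicates, and both are then averaged over the
# non-padding tokens only. The shapes below are small made-up placeholders.
def _demo_bce_reduction(batch=2, seq_len=8, num_predicates=3):
    subject_loss = torch.rand(batch, seq_len, 2)
    object_loss = torch.rand(batch, seq_len, num_predicates, 2)
    mask = torch.ones(batch, seq_len)
    subject_part = (subject_loss.mean(dim=-1) * mask).sum() / mask.sum()
    object_part = (object_loss.mean(dim=-1).sum(dim=-1) * mask).sum() / mask.sum()
    return subject_part + object_part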
train_model.compile(loss=BCELoss(reduction='none'), optimizer=optim.Adam(train_model.parameters(), 1e-5))
def extract_spoes(text):
"""抽取输入text所包含的三元组
"""
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
# 抽取subject
seq_output, subject_preds = train_model.predict_subject([token_ids, segment_ids])
subject_preds[:, [0, -1]] *= 0 # 首cls, 尾sep置为0
start = torch.where(subject_preds[0, :, 0] > 0.6)[0]
end = torch.where(subject_preds[0, :, 1] > 0.5)[0]
subjects = []
for i in start:
j = end[end >= i]
if len(j) > 0:
j = j[0]
subjects.append((i.item(), j.item()))
if subjects:
spoes = []
# token_ids = token_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
# segment_ids = segment_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
seq_output = seq_output.repeat([len(subjects)]+[1]*(len(seq_output.shape)-1))
subjects = torch.tensor(subjects, dtype=torch.long, device=device)
# 传入subject,抽取object和predicate
object_preds = train_model.predict_object([seq_output, subjects])
object_preds[:, [0, -1]] *= 0
for subject, object_pred in zip(subjects, object_preds):
start = torch.where(object_pred[:, :, 0] > 0.6)
end = torch.where(object_pred[:, :, 1] > 0.5)
for _start, predicate1 in zip(*start):
for _end, predicate2 in zip(*end):
if _start <= _end and predicate1 == predicate2:
spoes.append(
((mapping[subject[0]][0],
mapping[subject[1]][-1]), predicate1.item(),
(mapping[_start][0], mapping[_end][-1]))
)
break
return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1])
for s, p, o, in spoes]
else:
return []
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (
tuple(tokenizer.tokenize(spo[0])),
spo[1],
tuple(tokenizer.tokenize(spo[2])),
)
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 1e-10, 1e-10, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
R = set([SPO(spo) for spo in extract_spoes(d['text'])])
T = set([SPO(spo) for spo in d['spo_list']])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
pbar.update()
pbar.set_description(
'f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall)
)
s = json.dumps({
'text': d['text'],
'spo_list': list(T),
'spo_list_pred': list(R),
'new': list(R - T),
'lack': list(T - R),
},
ensure_ascii=False,
indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
# optimizer.apply_ema_weights()
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# train_model.save_weights('best_model.pt')
# optimizer.reset_old_weights()
print(
'f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
(f1, precision, recall, self.best_val_f1)
)
if __name__ == '__main__':
evaluator = Evaluator()
train_model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
train_model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task with a TPLinker-style design based on GlobalPointer.
# Write-up: https://kexue.fm/archives/8888
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.layers import GlobalPointer
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.losses import SparseMultilabelCategoricalCrossentropy
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载标签字典
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def collate_fn(batch):
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
batch_token_ids, batch_segment_ids = [], []
batch_entity_labels, batch_head_labels, batch_tail_labels = [], [], []
for d in batch:
token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
# 整理三元组 {s: [(o, p)]}
spoes = set()
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
p = predicate2id[p]
o = tokenizer.encode(o)[0][1:-1]
sh = search(s, token_ids)
oh = search(o, token_ids)
if sh != -1 and oh != -1:
spoes.add((sh, sh + len(s) - 1, p, oh, oh + len(o) - 1))
# 构建标签
entity_labels = [set() for _ in range(2)]
head_labels = [set() for _ in range(len(predicate2id))]
tail_labels = [set() for _ in range(len(predicate2id))]
for sh, st, p, oh, ot in spoes:
entity_labels[0].add((sh, st))
entity_labels[1].add((oh, ot))
head_labels[p].add((sh, oh))
tail_labels[p].add((st, ot))
for label in entity_labels + head_labels + tail_labels:
if not label: # 至少要有一个标签
label.add((0, 0)) # 如果没有则用0填充
entity_labels = sequence_padding([list(l) for l in entity_labels]) # [subject/object=2, 实体个数, 实体起终点]
head_labels = sequence_padding([list(l) for l in head_labels]) # [关系个数, 该关系下subject/object配对数, subject/object起点]
tail_labels = sequence_padding([list(l) for l in tail_labels]) # [关系个数, 该关系下subject/object配对数, subject/object终点]
# 构建batch
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_entity_labels.append(entity_labels)
batch_head_labels.append(head_labels)
batch_tail_labels.append(tail_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
# batch_entity_labels: [btz, subject/object=2, 实体个数, 实体起终点]
# batch_head_labels: [btz, 关系个数, 该关系下subject/object配对数, subject/object起点]
# batch_tail_labels: [btz, 关系个数, 该关系下subject/object配对数, subject/object终点]
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, seq_dims=2), dtype=torch.float, device=device)
batch_head_labels = torch.tensor(sequence_padding(batch_head_labels, seq_dims=2), dtype=torch.float, device=device)
batch_tail_labels = torch.tensor(sequence_padding(batch_tail_labels, seq_dims=2), dtype=torch.float, device=device)
return [batch_token_ids, batch_segment_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path)
self.entity_output = GlobalPointer(hidden_size=768, heads=2, head_size=64)
self.head_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)
self.tail_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)
def forward(self, inputs):
hidden_states = self.bert(inputs) # [btz, seq_len, hdsz]
mask = inputs[0].gt(0).long()
entity_output = self.entity_output(hidden_states, mask) # [btz, heads, seq_len, seq_len]
head_output = self.head_output(hidden_states, mask) # [btz, heads, seq_len, seq_len]
tail_output = self.tail_output(hidden_states, mask) # [btz, heads, seq_len, seq_len]
return entity_output, head_output, tail_output
model = Model().to(device)
class MyLoss(SparseMultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_preds, y_trues):
''' y_preds: [Tensor], shape为[btz, heads, seq_len ,seq_len]
'''
loss_list = []
for y_pred, y_true in zip(y_preds, y_trues):
shape = y_pred.shape
# 乘以seq_len是因为(i, j)在展开到seq_len*seq_len维度对应的下标是i*seq_len+j
y_true = y_true[..., 0] * shape[2] + y_true[..., 1] # [btz, heads, 实体起终点的下标]
y_pred = y_pred.reshape(shape[0], -1, np.prod(shape[2:])) # [btz, heads, seq_len*seq_len]
loss = super().forward(y_pred, y_true.long())
loss = torch.mean(torch.sum(loss, dim=1))
loss_list.append(loss)
return {'loss': sum(loss_list)/3, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}
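# Illustrative sketch only: MyLoss.forward flattens each labelled (start, end) pair into a
# single index over the seq_len*seq_len score matrix, which is the form expected by
# SparseMultilabelCategoricalCrossentropy. The tiny example below shows the index
# arithmetic on made-up numbers and is not used during training.
def _demo_flatten_span(start=2, end=5, seq_len=128):
    flat_index = start * seq_len + end          # (i, j) -> i*seq_len + j
    # a [seq_len, seq_len] score matrix reshaped to [seq_len*seq_len] puts the score for
    # (start, end) at exactly this position:
    scores = torch.arange(seq_len * seq_len).reshape(seq_len, seq_len)
    assert scores.reshape(-1)[flat_index] == scores[start, end]
    return flat_index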
model.compile(loss=MyLoss(mask_zero=True), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['entity_loss', 'head_loss', 'tail_loss'])
def extract_spoes(text, threshold=0):
"""抽取输入text所包含的三元组
"""
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
outputs = model.predict([token_ids, segment_ids])
outputs = [o[0].cpu().numpy() for o in outputs] # [heads, seq_len, seq_len]
# 抽取subject和object
subjects, objects = set(), set()
outputs[0][:, [0, -1]] -= float('inf')
outputs[0][:, :, [0, -1]] -= float('inf')
for l, h, t in zip(*np.where(outputs[0] > threshold)):
if l == 0:
subjects.add((h, t))
else:
objects.add((h, t))
# 识别对应的predicate
spoes = set()
for sh, st in subjects:
for oh, ot in objects:
p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
ps = set(p1s) & set(p2s)
for p in ps:
spoes.add((
text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[p],
text[mapping[oh][0]:mapping[ot][-1] + 1]
))
return list(spoes)
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 0, 1e-10, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
R = set([SPO(spo) for spo in extract_spoes(d['text'])])
T = set([SPO(spo) for spo in d['spo_list']])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
pbar.update()
pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall))
s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
# optimizer.apply_ema_weights()
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
# optimizer.reset_old_weights()
print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %(f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task with TPLinker; with the 'cat' handshaking mode the entity part converges quickly while the relation part converges more slowly.
# Official repo: https://github.com/131250208/TPlinker-joint-extraction
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.layers import TplinkerHandshakingKernel
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
maxlen = 50
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载标签字典
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def trans_ij2k(seq_len, i, j):
'''把第i行,第j列转化成上三角flat后的序号
'''
if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
return 0
return int(0.5*(2*seq_len-i+1)*i+(j-i))
map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}
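# Illustrative sketch only (not executed during training): a quick check that trans_ij2k
# maps (i, j) with j >= i to the position of that pair in a row-by-row enumeration of the
# upper triangle, which is the flattened "handshaking" order map_ij2k assumes. `n` is a
# small placeholder sequence length.
def _demo_check_trans_ij2k(n=5):
    flat_pairs = [(i, j) for i in range(n) for j in range(i, n)]
    for k, (i, j) in enumerate(flat_pairs):
        assert trans_ij2k(n, i, j) == k
    return len(flat_pairs)                      # == n * (n + 1) // 2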
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
def collate_fn(batch):
pair_len = maxlen * (maxlen+1)//2
# batch_entity_labels: [btz, pair_len]
# batch_head_labels: [btz, rel_size, pair_len]
# batch_tail_labels: [btz, rel_size, pair_len]
batch_entity_labels = torch.zeros((len(batch), pair_len), dtype=torch.long, device=device)
batch_head_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)
batch_tail_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)
batch_token_ids = []
for i, d in enumerate(batch):
token_ids = tokenizer.encode(d['text'])[0][1:-1][:maxlen] # 这里要限制取前max_len个
batch_token_ids.append(token_ids)
# 整理三元组 {s: [(o, p)]}
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
p = predicate2id[p]
o = tokenizer.encode(o)[0][1:-1]
sh = search(s, token_ids) # 这里超过长度就会找不到
oh = search(o, token_ids)
if sh != -1 and oh != -1:
st, ot = sh+len(s)-1, oh+len(o)-1
batch_entity_labels[i, map_ij2k[sh, st]] = 1
batch_entity_labels[i, map_ij2k[oh, ot]] = 1
if sh <= oh:
batch_head_labels[i, p, map_ij2k[sh, oh]] = 1
else:
batch_head_labels[i, p, map_ij2k[oh, sh]] = 2
if st <= ot:
batch_tail_labels[i, p, map_ij2k[st, ot]] = 1
else:
batch_tail_labels[i, p, map_ij2k[ot, st]] = 2
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
return [batch_token_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0)
self.combine_fc = nn.Linear(768*2, 768)
self.ent_fc = nn.Linear(768, 2)
self.head_rel_fc = nn.Linear(768, len(predicate2id)*3)
self.tail_rel_fc = nn.Linear(768, len(predicate2id)*3)
self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cat')
def forward(self, inputs):
last_hidden_state = self.bert(inputs) # [btz, seq_len, hdsz]
shaking_hiddens = self.handshaking_kernel(last_hidden_state) # [btz, pair_len, hdsz]
ent_shaking_outputs = self.ent_fc(shaking_hiddens) # [btz, pair_len, 2]
btz, pair_len = shaking_hiddens.shape[:2]
head_rel_shaking_outputs = self.head_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3) #[btz, predicate_num, pair_len, 3]
tail_rel_shaking_outputs = self.tail_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3)
return ent_shaking_outputs, head_rel_shaking_outputs, tail_rel_shaking_outputs
model = Model().to(device)
class MyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_preds, y_trues):
loss_list = []
for y_pred, y_true in zip(y_preds, y_trues):
loss = super().forward(y_pred.view(-1, y_pred.size()[-1]), y_true.view(-1))
loss_list.append(loss)
z = (2 * len(predicate2id) + 1)
total_steps = 6000 # 前期实体识别的权重高一些,建议也可以设置为model.total_steps
w_ent = max(1 / z + 1 - model.global_step / total_steps, 1 / z)
w_rel = min((len(predicate2id) / z) * model.global_step / total_steps, (len(predicate2id) / z))
loss = w_ent*loss_list[0] + w_rel*loss_list[1] + w_rel*loss_list[2]
return {'loss': loss, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 5e-5), metrics=['entity_loss', 'head_loss', 'tail_loss'])
def extract_spoes(text):
"""抽取输入text所包含的三元组
"""
def get_spots_fr_shaking_tag(shaking_tag):
'''解析关系
'''
spots = []
for shaking_inds in shaking_tag.nonzero():
rel_id = shaking_inds[0].item()
tag_id = shaking_tag[rel_id][shaking_inds[1]].item()
matrix_inds = map_k2ij[shaking_inds[1].item()]
# 保证前面是subject,后面是object
if tag_id == 1:
spot = (rel_id, matrix_inds[0], matrix_inds[1])
elif tag_id == 2:
spot = (rel_id, matrix_inds[1], matrix_inds[0])
spots.append(spot)
return spots
tokens = tokenizer.tokenize(text)[1:-1]
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.encode(text)[0][1:-1]
token_ids_ts = torch.tensor(sequence_padding([token_ids], length=maxlen), dtype=torch.long, device=device)
outputs = model.predict([token_ids_ts])
outputs = [o[0].argmax(dim=-1) for o in outputs]
# 抽取entity
ent_matrix_spots = set()
ent_text = set()
for shaking_ind in outputs[0].nonzero():
shaking_ind_ = shaking_ind[0].item()
# tag_id = outputs[0][shaking_ind_]
matrix_inds = map_k2ij[shaking_ind_]
spot = (matrix_inds[0], matrix_inds[1])
if (spot[0] < len(mapping)) and (spot[1] < len(mapping)): # 实体起始在mapping范围内
ent_matrix_spots.add(spot)
ent_text.add(text[mapping[spot[0]][0]:mapping[spot[1]][-1] + 1])
# 识别对应的predicate
head_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[1])
tail_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[2])
spoes = []
for rel_h, sh, oh in head_rel_matrix_spots:
for rel_t, st, ot in tail_rel_matrix_spots:
# 如果关系相同,且(sh, st)和(oh, ot)都在entity_maxtrix_spots中
if (rel_h == rel_t) and ((sh, st) in ent_matrix_spots) and ((oh, ot) in ent_matrix_spots):
spoes.append((text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[rel_h], text[mapping[oh][0]:mapping[ot][-1] + 1]))
return spoes, token_ids, ent_text
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 0, 1e-10, 1e-10
E1, E2 = 0, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
spoes, token_ids, ent_text_pred = extract_spoes(d['text'])
# spo_list是用来根据maxlen删减的
spo_list = []
for s, p, o in d['spo_list']:
s_ = tokenizer.encode(s)[0][1:-1]
o_ = tokenizer.encode(o)[0][1:-1]
sh = search(s_, token_ids) # 这里超过长度就会找不到
oh = search(o_, token_ids)
if sh != -1 and oh != -1:
spo_list.append((s, p, o))
# 计算三元组的f1值
R = set([SPO(spo) for spo in spoes])
T = set([SPO(spo) for spo in spo_list])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
# 计算实体的指标
ent_text_truth = set([spo[0] for spo in spo_list] + [spo[-1] for spo in spo_list])
E1 += len(ent_text_pred & ent_text_truth)
E2 += len(ent_text_truth)
E_acc = E1 / E2
# 计算entity_matrix, head_matrix,tail_matrix的accuracy
pbar.update()
pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f, ent_acc: %.5f' % (f1, precision, recall, E_acc))
s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task with tplinker_plus.
# Official repo: https://github.com/131250208/TPlinker-joint-extraction
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import TplinkerHandshakingKernel
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
maxlen = 50
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载标签字典
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def trans_ij2k(seq_len, i, j):
'''Map row i, column j to its index in the flattened upper-triangular matrix
'''
if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
return 0
return int(0.5*(2*seq_len-i+1)*i+(j-i))
map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}
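# Illustrative sanity check (not required by the pipeline): the flat index walks the upper
# triangle row by row, e.g. (0,0)->0, (0,1)->1 and (1,1)->maxlen, and map_k2ij inverts map_ij2k.
assert map_ij2k[(0, 0)] == 0 and map_ij2k[(0, 1)] == 1 and map_ij2k[(1, 1)] == maxlen
assert all(map_ij2k[ij] == k for k, ij in map_k2ij.items())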
def tran_ent_rel2id():
'''Build the tag-to-id mapping for the final classification layer
'''
tag2id = {'ent': 0}
for p in predicate2id.keys():
for mode in ['sh_oh', 'oh_sh', 'st_ot', 'ot_st']:
tag2id[p+'##'+mode] = len(tag2id)
return tag2id
tag2id = tran_ent_rel2id()
id2tag = {v: k for k, v in tag2id.items()}
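# Illustrative check: the tag space is one 'ent' tag plus four direction-specific tags
# (sh_oh/oh_sh for heads, st_ot/ot_st for tails) per predicate.
assert len(tag2id) == 1 + 4 * len(predicate2id)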
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
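# Usage example (illustrative): matching is done on token-id sub-sequences, e.g.
assert search([3, 4], [1, 2, 3, 4, 5]) == 2 and search([9], [1, 2]) == -1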
def collate_fn(batch):
pair_len = maxlen * (maxlen+1)//2
# batch_head_labels: [btz, pair_len, tag2id_len]
batch_labels = torch.zeros((len(batch), pair_len, len(tag2id)), dtype=torch.long, device=device)
batch_token_ids = []
for i, d in enumerate(batch):
token_ids = tokenizer.encode(d['text'])[0][1:-1][:maxlen] # 这里要限制取前max_len个
batch_token_ids.append(token_ids)
# 整理三元组 {s: [(o, p)]}
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
o = tokenizer.encode(o)[0][1:-1]
sh = search(s, token_ids)
oh = search(o, token_ids)
if sh != -1 and oh != -1:
st, ot = sh+len(s)-1, oh+len(o)-1
batch_labels[i, map_ij2k[sh, st], tag2id['ent']] = 1
batch_labels[i, map_ij2k[oh, ot], tag2id['ent']] = 1
if sh <= oh:
batch_labels[i, map_ij2k[sh, oh], tag2id[p+'##sh_oh']] = 1
else:
batch_labels[i, map_ij2k[oh, sh], tag2id[p+'##oh_sh']] = 1
if st <= ot:
batch_labels[i, map_ij2k[st, ot], tag2id[p+'##st_ot']] = 1
else:
batch_labels[i, map_ij2k[ot, st], tag2id[p+'##ot_st']] = 1
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
return [batch_token_ids], batch_labels
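# Note: labels live on the flattened upper-triangular "shaking" grid. The (head, tail) cell of
# every entity gets the 'ent' tag, while subject/object head pairs and tail pairs get the
# predicate tag suffixed with its orientation (sh_oh/oh_sh, st_ot/ot_st), so that decoding can
# recover which span is the subject and which is the object.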
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0)
self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cln_plus', inner_enc_type='lstm')
self.fc = nn.Linear(768, len(tag2id))
def forward(self, inputs):
last_hidden_state = self.bert(inputs) # [btz, seq_len, hdsz]
shaking_hiddens = self.handshaking_kernel(last_hidden_state)
output = self.fc(shaking_hiddens) # [btz, pair_len, tag_size]
return output
model = Model().to(device)
model.compile(loss=MultilabelCategoricalCrossentropy(), optimizer=optim.Adam(model.parameters(), 5e-5))
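# Note (general property of this loss, stated as an assumption): MultilabelCategoricalCrossentropy
# trains raw logits so that positive tags end up with scores above 0 and negative tags below 0,
# which is why extract_spoes below decodes the shaking matrix with a plain threshold of 0 rather
# than a sigmoid cutoff.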
def extract_spoes(text, threshold=0):
"""抽取输入text所包含的三元组
"""
tokens = tokenizer.tokenize(text)[1:-1]
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.encode(text)[0][1:-1]
token_ids_ = torch.tensor(sequence_padding([token_ids], length=maxlen), dtype=torch.long, device=device)
outputs = model.predict([token_ids_])[0].cpu().numpy() # [pair_len, tag_size]
# 抽取entity, 识别对应的predicate
ent_matrix_spots, ent_text = set(), set()
head_rel_matrix_spots, tail_rel_matrix_spots = [], []
for shaking_ind, tag_id in zip(*np.where(outputs > threshold)):
matrix_inds = map_k2ij[shaking_ind]
spot = (matrix_inds[0], matrix_inds[1])
if (spot[0] < len(mapping)) and (spot[1] < len(mapping)): # 实体起始在mapping范围内
p = id2tag[tag_id].split('##')[0]
if id2tag[tag_id] == 'ent':
ent_matrix_spots.add(spot)
ent_text.add(text[mapping[spot[0]][0]:mapping[spot[1]][-1] + 1])
else:
p = predicate2id[p]
if id2tag[tag_id].endswith('##sh_oh'):
head_rel_matrix_spots.append((p, spot[0], spot[1]))
elif id2tag[tag_id].endswith('##oh_sh'):
head_rel_matrix_spots.append((p, spot[1], spot[0]))
elif id2tag[tag_id].endswith('##st_ot'):
tail_rel_matrix_spots.append((p, spot[0], spot[1]))
elif id2tag[tag_id].endswith('##ot_st'):
tail_rel_matrix_spots.append((p, spot[1], spot[0]))
spoes = []
for rel_h, sh, oh in head_rel_matrix_spots:
for rel_t, st, ot in tail_rel_matrix_spots:
# same relation, and both (sh, st) and (oh, ot) appear in ent_matrix_spots
if (rel_h == rel_t) and ((sh, st) in ent_matrix_spots) and ((oh, ot) in ent_matrix_spots):
spoes.append((text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[rel_h], text[mapping[oh][0]:mapping[ot][-1] + 1]))
return spoes, token_ids, ent_text
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 0, 1e-10, 1e-10
E1, E2 = 0, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
spoes, token_ids, ent_text_pred = extract_spoes(d['text'])
# spo_list是用来根据maxlen删减的
spo_list = []
for s, p, o in d['spo_list']:
s_ = tokenizer.encode(s)[0][1:-1]
o_ = tokenizer.encode(o)[0][1:-1]
sh = search(s_, token_ids) # 这里超过长度就会找不到
oh = search(o_, token_ids)
if sh != -1 and oh != -1:
spo_list.append((s, p, o))
# 计算三元组的f1值
R = set([SPO(spo) for spo in spoes])
T = set([SPO(spo) for spo in spo_list])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
# 计算实体的指标
ent_text_truth = set([spo[0] for spo in spo_list] + [spo[-1] for spo in spo_list])
E1 += len(ent_text_pred & ent_text_truth)
E2 += len(ent_text_truth)
E_acc = E1 / E2
# 计算entity_matrix, head_matrix,tail_matrix的accuracy
pbar.update()
pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f, ent_acc: %.5f' % (f1, precision, recall, E_acc))
s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
# Sohu entity-based sentiment classification
- Competition link: https://www.biendata.xyz/competition/sohu_2022/
| Solution | Link | Metric |
| ---- | ---- | ---- |
| Top1 | [Zhihu](https://zhuanlan.zhihu.com/p/533808475) | preliminary round f1=0.7253, final round f1=0.8173 |
| baseline | —— | preliminary round f1=0.6737 |
# bert4torch reproduction
- The pretrained model is xlnet
- The competition is closed and results can no longer be submitted, so the reproduction is compared only on the offline dev set
- The dev set is the first 2000 samples rather than the last 10% used in the original solution, so the dev metric is slightly unstable
| Reproduction | Settings | Metric |
| ---- | ---- | ---- |
| Top1_github | first 2000 as dev, no swa, with warmup, no label_smoothing, no fgm, gradient accumulation=3, no rdrop | Epoch 5/10: f1=0.7697 |
| Top1_bert4torch_repro1 | same settings as above | Epoch 8/10: f1=0.7556 |
| Top1_bert4torch_repro2 | same settings as above + fgm + swa | Epoch 5/10: f1=0.7877 |
| Epoch | Top1_github | Top1_bert4torch_repro1 | Top1_bert4torch_repro2 |
| ---- | ---- | ---- | ---- |
| 1 | 0.728 | 0.7039 | 0.0274 |
| 2 | 0.7198 | 0.7327 | 0.7180 |
| 3 | 0.747 | 0.7531 | 0.7453 |
| 4 | 0.7625 | 0.7466 | 0.7594 |
| 5 | **0.7697** | 0.7464 | **0.7877** |
| 6 | 0.7638 | 0.7272 | 0.7726 |
| 7 | 0.7415 | 0.7471 | 0.7804 |
| 8 | 0.7593 | **0.7556** | 0.7829 |
| 9 | 0.7477 | 0.7455 | 0.7697 |
| 10 | 0.7466 | 0.7471 | 0.7620 |
\ No newline at end of file
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1883\n",
"样本总量: 89195\n",
"================================样本0, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 1, \"content\": \"3.新疆棉是全球业界公认的高品质天然纤维原料,较好满足了全球范围内对棉制纺织品服装的刚性消费需求,是中国乃至全球纺织工业健康可持续发展的重要原料保障。近年来,新疆地区不仅棉花种植生产保持稳定,棉纺织及服装产业也迅速发展,为促进地区经济发展、解决各族人民就业、改善民生福祉发挥了重要作用。新疆棉花种植和棉纺产业是全球纺织供应链的重要组成部分,2021年,新疆棉产量512.9万吨,约占全球棉花产量的20%,美国政府打压新疆棉花及其制品的行为,势必严重危害全球纺织产业供应链的安全,损害全球数千万产业工人的切身利益,对此我们表示强烈反对。4.2021年1月,新疆纺织行业协会发布了详实、客观的《新疆棉纺织行业社会责任报告》,报告以详实的数据和资料充分说明中国新疆维吾尔自治区不存在所谓的“强迫劳动”。我们建议全球纺织业界各相关利益方查阅《报告》的内容和观点,尊重从事实出发的价值观,拒绝虚伪的政治操作,反对恶意造谣。我们欢迎包括美国同业在内的国际品牌、机构实地走访考察新疆棉花产区、纺织服装工厂,独立了解、判断相关事实。我们愿为相关考察和贸易投资合作提供便利与协助。\", \"entity\": {\"美国\": 0, \"中国\": 0}}\\n']\n",
"================================样本1, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 22269, \"content\": \"新华社北京8月27日电美国疾病控制和预防中心日前发布的一项研究结果显示,新冠变异病毒德尔塔毒株成为主要流行毒株后,在美获批的疫苗整体有效性降低约三分之一。研究人员分析了抗疫一线工作人员从2020年12月14日开始的疫苗接种和新冠感染情况。美国多个州的数千名抗疫一线工作人员参加了这项研究,他们每周接受核酸检测。在德尔塔毒株成为主要流行毒株期间,488名没有接种疫苗者中有19人感染,其中有症状感染者的比例为94.7%;2352名完全接种疫苗者中有24人感染,其中有症状感染者的比例为75%。现有研究没有包含感染后的病情严重程度。研究人员分析各种因素后认为,在德尔塔毒株成为主要流行毒株后,美国辉瑞、莫德纳和强生疫苗的整体有效性为66%。而先前发布的数据显示,截至2021年4月10日,这些疫苗的整体有效性为91%。据媒体报道,研究人员计划进一步分析不同疫苗的有效性,以及接种疫苗者和未接种疫苗者被感染后的症状特征等。(完)\", \"entity\": {\"毒株\": 0, \"德尔塔\": 0}}\\n']\n",
"================================样本2, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 44594, \"content\": \"民航局2022年1月21日发布的熔断指令去年底,多班自美国飞往中国的航班推迟或取消,曾引起关注。中国外交部发言人赵立坚在去年12月就达美航空赴华航班中途返航一事回应表示,近日,多班自美国飞往中国的航班推迟或取消,美国航空在距离飞机起飞仅有数小时的情况下突然宣布取消航班,达美航空的航班甚至出现航程过半后返航情况,给中国籍乘客带来巨大损失。中国驻美使领馆积极向有关乘客提供协助,并第一时间向美国有关航司提出严正交涉,敦促其保障乘客正当权益。中国外交部发言人华春莹去年8月表示,众所周知,国际定期客运航班熔断/熔控措施是降低疫情跨境传播风险的重要举措,该措施对中外航空公司一视同仁,公平公开。在中美航线上,中国国内的国航、东航等航空公司都曾熔断过,对于没有触发熔断条件的航空公司,中方从未实施该措施,因此这次美方没有理由限制中国赴美航班客座率,美方做法非常不合理。为何熔断航班激增值得注意的是,早在去年8月,美国交通部就曾要求中国的航空公司在未来四周内,将部分中国赴美航班的客座率限制在40%,当时也是对于美联航被触发“熔断”措施的回应。\", \"entity\": {\"中国\": 0, \"航班\": 0}}\\n']\n",
"================================样本3, train: 66897 dev: 22298 dev_type2: 470\n",
"['{\"id\": 66896, \"content\": \"当地时间11月5日晚,在英国伦敦的“百万面具游行”(Million Mask March)活动过程中,抗议者与警方发生冲突,致8名警察受伤,十余名抗议者被捕。 据英国《卫报》5日报道,当天夜晚,数百名抗议者聚集在英国伦敦,参加一年一度的游行。在游行过程中,参与者抗议政府越权、收入不平等,以及最近新出台的新冠疫情限制措施。 报道称,部分抗议者在游行中与警方发生冲突。伦敦警察厅表示,在伦敦各处的示威活动中,共有12人因各种违法行为被拘捕,此外,已有8名警察在与抗议者的冲突中受伤。 伦敦警察厅还在社交平台发布声明称,“有部分人在议会广场上燃放烟花和爆竹,该行为非常危险。警方为防止民众受到伤害而进入人群。” 据此前报道,“百万面具游行”活动于2011年由一个匿名黑客论坛发起,旨在以游行示威的方式反对审查制度、腐败和战争。\", \"entity\": {\"伦敦\": 0, \"英国\": 0}}\\n']\n"
]
}
],
"source": [
"from sklearn.model_selection import StratifiedKFold\n",
"import json\n",
"with open('E:/Github/Sohu2022/Sohu2022_data/nlp_data/train.txt', 'r', encoding='utf-8') as f:\n",
" train_data = f.readlines()\n",
"tag2_index = []\n",
"for line in train_data:\n",
" line = json.loads(line)\n",
" if 2 in set(line['entity'].values()):\n",
" tag2_index.append(1)\n",
" else:\n",
" tag2_index.append(0)\n",
"print(sum(tag2_index))\n",
" \n",
"print('样本总量:', len(train_data))\n",
"file_id = 0\n",
"kfold = StratifiedKFold(n_splits=4).split(train_data, tag2_index)\n",
"for i, (train_idx, dev_idx) in enumerate(kfold):\n",
" train, dev = [train_data[i] for i in train_idx], [train_data[i] for i in dev_idx]\n",
" dev_type2 = [tag2_index[i] for i in dev_idx]\n",
" with open(f'E:/Github/Sohu2022/Sohu2022_data/nlp_data/dev_{file_id}.txt', 'w', encoding='utf-8') as f:\n",
" f.writelines(dev)\n",
" with open(f'E:/Github/Sohu2022/Sohu2022_data/nlp_data/train_{file_id}.txt', 'w', encoding='utf-8') as f:\n",
" f.writelines(train)\n",
" \n",
" print(f'================================样本{file_id}, train: ', len(train), 'dev: ', len(dev), 'dev_type2: ', sum(dev_type2))\n",
" print(dev[:1])\n",
" file_id += 1"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
#! -*- coding:utf-8 -*-
# Sohu 2022 entity sentiment classification baseline, https://www.biendata.xyz/competition/sohu_2022/
# Approach: average-pool the hidden states at the head and tail tokens of the entity's first occurrence in the sentence, fgm + multi_dropout + cv, f1=0.67176
import numpy as np
import random
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.losses import FocalLoss
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
import random
import os
import argparse
import pickle
import warnings
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser(description='交叉验证')
parser.add_argument('--fileid', default=0)
parser.add_argument('--gpuid', default=0)
args = parser.parse_args()
fileid = args.fileid
gpuid = args.gpuid
# 配置设置
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'
prefix = f'_char_512_cv_{fileid}'
save_path = f'./output/section1{prefix}.txt'
save_path_dev = f'./output/dev{prefix}.txt'
ckpt_path = f'./ckpt/best_model{prefix}.pt'
device = f'cuda:{gpuid}' if torch.cuda.is_available() else 'cpu'
seed = 42
# 模型设置
epochs = 10
steps_per_epoch = 1000
total_eval_step = None
maxlen = 512
batch_size = 7
batch_size_eval = 64
categories = [-2, -1, 0, 1, 2]
categories_count = {k+1:0 for k in range(len(categories))}
# 固定seed
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# 加载数据集
class MyDataset(ListDataset):
def __init__(self, file_path=None, data=None, mode='train'):
self.mode = mode
super().__init__(file_path, data)
def load_data(self, filename):
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
with open(filename, encoding='utf-8') as f:
for l in tqdm(f, desc=f'[Loading {self.mode} data]'):
taskData = json.loads(l.strip())
id = taskData['id']
# 按照最长长度和标点符号切分
for t in text_segmentate(taskData['content'], maxlen - 2, seps, strips):
entitys = []
# train
if isinstance(taskData['entity'], dict):
for ent, label in taskData['entity'].items():
start = self.search(ent, t)
if start != -1:
label = categories.index(label)+1
entitys.append((ent, start, start+len(ent)-1, label)) # +1是为了padding
categories_count[label] += 1
# test
elif isinstance(taskData['entity'], list):
for ent in taskData['entity']:
start = self.search(ent, t)
if start != -1:
entitys.append((ent, start, start+len(ent)-1, 0))
if entitys: # 如果存在实体
D.append((id, t, *entitys))
return D
def search(self, pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_extra, batch_token_ids, batch_entity_ids, batch_entity_labels = [], [], [], []
for d in batch:
id, contents, entities = d[0], d[1], d[2:]
tokens = tokenizer.tokenize(contents, maxlen=maxlen)[1:-1]
tokens = ['[CLS]'] + [j for i in tokens for j in i] + ['[SEP]'] # 转成char为单位的
mapping = tokenizer.rematch(contents, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
entity_ids, entity_labels, extra_map = [], [], {}
for ent, start, end, label in entities:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
entity_ids.append([start, end])
# # 验证边界id没有问题
# if ''.join(tokenizer.ids_to_tokens(token_ids[start:end+1])) != ent.lower():
# print(''.join(tokenizer.ids_to_tokens(token_ids[start:end+1])), ent)
entity_labels.append(label)
extra_map[(start, end)] = (ent, label)
if not entity_ids: # 至少要有一个标签
entity_ids.append([0, 0]) # 如果没有则用0填充
entity_labels.append(0)
batch_extra.append((id, extra_map))
batch_token_ids.append(token_ids)
batch_entity_ids.append(entity_ids)
batch_entity_labels.append(entity_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device) # [btz, 实体个数,start/end]
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device) # [btz, 实体个数]
return [batch_token_ids, batch_entity_ids, batch_extra], batch_entity_labels
# 转换数据集
train_dataloader = DataLoader(MyDataset(f'{data_dir}/train_{fileid}.txt'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'{data_dir}/dev_{fileid}.txt', mode='dev'), batch_size=batch_size_eval, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'{data_dir}/test.txt', mode='test'), batch_size=batch_size_eval, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.dropout = [nn.Dropout(0.1), nn.Dropout(0.3), nn.Dropout(0.5), nn.Dropout(0.7)]
self.dense = nn.Linear(768, 5+1) # 包含padding
def forward(self, inputs):
token_ids, entity_ids = inputs[0], inputs[1]
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
btz, entity_count, _ = entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
entity_states = torch.mean(entity_states, dim=2) # 取实体首尾hidden_states的均值
entity_logits = []
for dropout in self.dropout:
entity_logits.append(self.dense(dropout(entity_states)))
return entity_logits
model = Model().to(device)
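# Minimal sketch (illustrative only; the '_'-prefixed names are not used elsewhere): the
# gather-based span pooling in Model.forward picks the hidden states at an entity's first and
# last token and averages them.
_h = torch.arange(24, dtype=torch.float).reshape(1, 6, 4)  # [btz=1, seq_len=6, hdsz=4]
_ids = torch.tensor([[[1, 3]]])  # one entity whose first/last tokens sit at positions 1 and 3
_idx = _ids.reshape(1, -1, 1).repeat(1, 1, 4)
_pooled = torch.gather(_h, dim=1, index=_idx).reshape(1, 1, -1, 4).mean(dim=2)
assert torch.allclose(_pooled[0, 0], (_h[0, 1] + _h[0, 3]) / 2)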
print(categories_count)
class Loss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.loss_fn = FocalLoss(ignore_index=0)
def forward(self, entity_logits, labels):
loss = 0
for entity_logit in entity_logits:
loss += self.loss_fn(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
return loss
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=1e-5), adversarial_train={'name': 'fgm'})
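# Note (general description of FGM, stated as an assumption about the library hook):
# adversarial_train with 'fgm' applies FGM-style adversarial training, i.e. the embedding weights
# are perturbed along the gradient direction for an extra forward/backward pass, trading some
# compute for robustness.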
def evaluate(data):
valid_true, valid_pred = [], []
eval_step = 0
result, result_prob = dict(), dict()
for (token_ids, entity_ids, extra), entity_labels in tqdm(data):
entity_logit = model.predict([token_ids, entity_ids])[0] # [btz, 实体个数, 实体类别数]
entity_logit = F.softmax(entity_logit, dim=-1)
entity_prob, entity_pred = torch.max(entity_logit, dim=-1) # [btz, 实体个数]
# v_pred和v_true是实体的预测结果,entity_tuple是(smp_id, ent_id, start, end, label, prob)的列表
v_pred, entity_tuple = trans_entity2tuple(entity_ids, entity_pred, entity_prob)
v_true, _ = trans_entity2tuple(entity_ids, entity_labels)
valid_pred.extend(v_pred)
valid_true.extend(v_true)
# generate submit result
for id_, ent_id_, start, end, label_, prob in entity_tuple:
label_ = label_-3
smp_id, s_e_ents = extra[id_][0], extra[id_][1]
if (start, end) not in s_e_ents:
raise ValueError('entity missing')
if smp_id not in result:
result[smp_id], result_prob[smp_id] = {}, {}
ent_name = s_e_ents[(start, end)][0]
if ent_name in result[smp_id] and prob < result[smp_id][ent_name][-1]:
# 如果同一个实体
continue
else:
result[smp_id].update({ent_name: (label_, prob)})
ent_prob = entity_logit[id_][ent_id_].cpu().numpy()
result_prob[smp_id].update({ent_name: ent_prob})
assert prob == ent_prob[label_+3]
eval_step += 1
if (total_eval_step is not None) and (eval_step >= total_eval_step):
break
valid_true = np.array(valid_true)
valid_pred = np.array(valid_pred)
f1 = f1_score(valid_true, valid_pred, average='macro')
acc = accuracy_score(valid_true, valid_pred)
print(classification_report(valid_true, valid_pred))
# 只保留label,不需要prob
for k, v in result.items():
result[k] = {i: j[0] for i, j in v.items()}
return f1, acc, result, result_prob
def trans_entity2tuple(entity_ids, entity_labels, entity_probs=None):
'''Convert tensors into (sample_id, entity_id, start, end, label[, prob]) tuples for metric computation
'''
y, ent_tuple = [], []
for i, one_sample in enumerate(entity_ids): # 遍历样本
for j, item in enumerate(one_sample): # 遍历实体
if item[0].item() * item[1].item() != 0:
tmp = (i, j, item[0].item(), item[1].item(), entity_labels[i, j].item())
y.append(entity_labels[i, j].item())
ent_tuple.append(tmp if entity_probs is None else tmp + (entity_probs[i, j].item(),))
return y, ent_tuple
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, acc, pred_result, pred_result_prob = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
model.save_weights(ckpt_path)
# save_result(pred_result, pred_result_prob, save_path=save_path_dev)
print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
def save_result(result, result_prob, save_path):
result = [(key, value) for key, value in result.items()]
result.sort(key=lambda x: x[0])
result_str = 'id\tresult\n'
for key, value in result:
result_str += f'{key}\t{value}\n'
with open(save_path, 'w', encoding='utf-8') as f:
f.write(result_str)
# 保存概率
with open(save_path[:-4] + '_prob.pkl', 'wb') as f:
pickle.dump(result_prob, f)
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])
model.load_weights(ckpt_path)
f1, acc, pred_result, pred_result_prob = evaluate(test_dataloader)
save_result(pred_result, pred_result_prob, save_path=save_path)
\ No newline at end of file
#! -*- coding:utf-8 -*-
# Reproduction of the Top1 solution for Sohu 2022 entity sentiment classification, https://www.biendata.xyz/competition/sohu_2022/
# Write-up: https://zhuanlan.zhihu.com/p/533808475
# Approach: prompt-style concatenation [CLS]+sentence+[SEP]+ent1+[MASK]+ent2+[MASK]+[SEP], classifying at the [MASK] positions
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.tokenizers import Tokenizer, SpTokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import transformers
import random
from sklearn.metrics import f1_score, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")
# 配置设置
pretrain_model = 'F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base'
config_path = pretrain_model + '/bert4torch_config.json'
checkpoint_path = pretrain_model + '/pytorch_model.bin'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'
prefix = f'_char_512'
save_path = f'./section1{prefix}.txt'
save_path_dev = f'./dev{prefix}.txt'
ckpt_path = f'./best_model{prefix}.pt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
use_swa = False
use_adv_train = False
# 模型设置
epochs = 10
steps_per_epoch = None
total_eval_step = None
num_warmup_steps = 4000
maxlen = 900
batch_size = 6
batch_size_eval = 64
grad_accumulation_steps = 3
categories = [-2, -1, 0, 1, 2]
mask_symbol = '<mask>'
seed_everything(19260817)  # fix random seeds
# 加载数据集
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
for l in tqdm(f.readlines(), desc="Loading data"):
taskData = json.loads(l.strip())
text2 = ''.join([ent+mask_symbol for ent in taskData['entity'].keys()])
D.append((taskData['content'], text2, taskData['entity']))
return D
def search(tokens, search_token, start_idx=0):
mask_idxs = []
for i in range(len(tokens)):
if tokens[i] == search_token:
mask_idxs.append(i+start_idx)
return mask_idxs
# 建立分词器,这里使用transformer自带的
tokenizer = transformers.XLNetTokenizerFast.from_pretrained(pretrain_model)
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_entity_ids, batch_entity_labels = [], [], [], []
for text, prompt, entity in batch:
inputs = tokenizer(text=text, text_pair=prompt, add_special_tokens=True, max_length=maxlen, truncation="only_first")
token_ids, segment_ids = inputs['input_ids'], inputs['token_type_ids']
ent_ids = search(token_ids, tokenizer.mask_token_id)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_entity_ids.append(ent_ids)
batch_entity_labels.append([categories.index(label) for label in entity.values()])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, value=-1), dtype=torch.long, device=device) # [btz, 实体个数]
return [batch_token_ids, batch_segment_ids, batch_entity_ids], batch_entity_labels
# 转换数据集
all_data = load_data(f'{data_dir}/train.txt')
random.shuffle(all_data)
split_index = 2000 # int(len(all_data)*0.9)
train_dataloader = DataLoader(ListDataset(data=all_data[split_index:]), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=all_data[:split_index]), batch_size=batch_size_eval, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='xlnet')
hidden_size = self.bert.configs['hidden_size']
self.classifier = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.LeakyReLU(),
nn.Dropout(0.1),
nn.Linear(hidden_size, 5)
)
def forward(self, inputs):
token_ids, segment_ids, entity_ids = inputs
last_hidden_state = self.bert([token_ids, segment_ids]) # [btz, seq_len, hdsz]
entity_ids = entity_ids.unsqueeze(2).repeat(1, 1, last_hidden_state.shape[-1])
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids)
entity_logits = self.classifier(entity_states)
return entity_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, entity_logit, labels):
loss = super().forward(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
return loss
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps=len(train_dataloader)*epochs, last_epoch=-1)
model.compile(loss=Loss(ignore_index=-1), optimizer=optimizer, scheduler=scheduler, max_grad_norm=1.0, adversarial_train={'name': 'fgm' if use_adv_train else ''})
# swa
if use_swa:
def average_function(ax: torch.Tensor, x: torch.Tensor, num: int) -> torch.Tensor:
return ax + (x - ax) / (num + 1)
swa_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=average_function)
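# Note: average_function keeps a running arithmetic mean of the weights,
# ax_{n+1} = ax_n + (x - ax_n) / (n + 1), which is the standard SWA update rule.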
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, acc, pred_result = self.evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
model.save_weights(ckpt_path)
print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
if use_swa:
swa_model.update_parameters(model)
@staticmethod
def evaluate(data):
valid_true, valid_pred = [], []
eval_step = 0
result = dict()
for (token_ids, segment_ids, entity_ids), entity_labels in tqdm(data):  # collate_fn returns three inputs
if use_swa:
swa_model.eval()
with torch.no_grad():
entity_logit = F.softmax(swa_model([token_ids, segment_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
else:
entity_logit = F.softmax(model.predict([token_ids, segment_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
_, entity_pred = torch.max(entity_logit, dim=-1) # [btz, 实体个数]
# v_pred和v_true是实体的预测结果
valid_index = (entity_ids.flatten()>0).nonzero().squeeze(-1)
valid_pred.extend(entity_pred.flatten()[valid_index].cpu().tolist())
valid_true.extend(entity_labels.flatten()[valid_index].cpu().tolist())
eval_step += 1
if (total_eval_step is not None) and (eval_step >= total_eval_step):
break
valid_true = np.array(valid_true)
valid_pred = np.array(valid_pred)
f1 = f1_score(valid_true, valid_pred, average='macro')
acc = accuracy_score(valid_true, valid_pred)
print(classification_report(valid_true, valid_pred))
# 只保留label,不需要prob
for k, v in result.items():
result[k] = {i: j[0] for i, j in v.items()}
return f1, acc, result
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, grad_accumulation_steps=grad_accumulation_steps, callbacks=[evaluator])
model.load_weights(ckpt_path)
f1, acc, pred_result = Evaluator.evaluate(valid_dataloader)
#! -*- coding:utf-8 -*-
# Reproduction of the Top1 solution for Sohu 2022 entity sentiment classification, https://www.biendata.xyz/competition/sohu_2022/
# Write-up: https://zhuanlan.zhihu.com/p/533808475
# Approach: prompt-style concatenation [CLS]+sentence+[SEP]+ent1+[MASK]+ent2+[MASK]+[SEP], classifying at the [MASK] positions
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")
# 配置设置
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'
prefix = f'_char_512'
save_path = f'./section1{prefix}.txt'
save_path_dev = f'./dev{prefix}.txt'
ckpt_path = f'./best_model{prefix}.pt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
use_swa = True
# 模型设置
epochs = 10
steps_per_epoch = None
total_eval_step = None
num_warmup_steps = 4000
maxlen = 512
batch_size = 7
batch_size_eval = 64
categories = [-2, -1, 0, 1, 2]
seed_everything(42)  # fix random seeds
# 加载数据集
def load_data(filename):
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
with open(filename, encoding='utf-8') as f:
for l in tqdm(f.readlines(), desc="Loading data"):
taskData = json.loads(l.strip())
text2 = ''.join([ent+'[MASK]' for ent in taskData['entity'].keys()]) + '[SEP]'
text2_len = sum([len(ent)+1 for ent in taskData['entity'].keys()]) + 1
for t in text_segmentate(taskData['content'], maxlen-text2_len-2, seps, strips):
D.append((t, text2, taskData['entity']))
return D
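# Illustrative example (hypothetical record): for content "abc..." with entities
# {"A": 1, "B": -1}, load_data yields ("abc...", "A[MASK]B[MASK][SEP]", {"A": 1, "B": -1})
# once per text segment, so every entity gets its own [MASK] slot in the prompt.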
def search(tokens, start_idx=0):
mask_idxs = []
for i in range(len(tokens)):
if tokens[i] == '[MASK]':
mask_idxs.append(i+start_idx)
return mask_idxs
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_entity_ids, batch_entity_labels = [], [], []
for text1, text2, entity in batch:
token_ids1 = tokenizer.encode(text1)[0]
tokens2 = tokenizer.tokenize(text2)[1:-1]
token_ids2 = tokenizer.tokens_to_ids(tokens2)
ent_ids_raw = search(tokens2, start_idx=len(token_ids1))
# 不在原文中的实体,其[MASK]标记不用于计算loss
ent_labels, ent_ids = [], []
for i, (ent, label) in enumerate(entity.items()):
if ent in text1:
assert tokens2[ent_ids_raw[i]-len(token_ids1)] == '[MASK]'
ent_ids.append(ent_ids_raw[i])
ent_labels.append(categories.index(label))
batch_token_ids.append(token_ids1 + token_ids2)
batch_entity_ids.append(ent_ids)
batch_entity_labels.append(ent_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, value=-1), dtype=torch.long, device=device) # [btz, 实体个数]
return [batch_token_ids, batch_entity_ids], batch_entity_labels
# 转换数据集
all_data = load_data(f'{data_dir}/train.txt')
split_index = int(len(all_data)*0.9)
train_dataloader = DataLoader(ListDataset(data=all_data[:split_index]), batch_size=batch_size, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=all_data[split_index:]), batch_size=batch_size_eval, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
hidden_size = self.bert.configs['hidden_size']
self.classifier = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.LeakyReLU(),
nn.Dropout(0.1),
nn.Linear(hidden_size, 5)
)
def forward(self, inputs):
token_ids, entity_ids = inputs[0], inputs[1]
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
hidden_size = last_hidden_state.shape[-1]
entity_ids = entity_ids.unsqueeze(2).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids)
entity_logits = self.classifier(entity_states)
return entity_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, entity_logit, labels):
loss = super().forward(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
return loss
optimizer = optim.Adam(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps=len(train_dataloader)*epochs, last_epoch=-1)
model.compile(loss=Loss(ignore_index=-1), optimizer=optimizer, scheduler=scheduler, adversarial_train={'name': 'fgm'})
# swa
if use_swa:
def average_function(ax: torch.Tensor, x: torch.Tensor, num: int) -> torch.Tensor:
return ax + (x - ax) / (num + 1)
swa_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=average_function)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, acc, pred_result = self.evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
model.save_weights(ckpt_path)
print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
if use_swa:
swa_model.update_parameters(model)
@staticmethod
def evaluate(data):
valid_true, valid_pred = [], []
eval_step = 0
result = dict()
for (token_ids, entity_ids), entity_labels in tqdm(data):
if use_swa:
swa_model.eval()
with torch.no_grad():
entity_logit = F.softmax(swa_model([token_ids, entity_ids]), dim=-1) # [btz, 实体个数, 实体类别数]
else:
entity_logit = F.softmax(model.predict([token_ids, entity_ids]), dim=-1) # [btz, 实体个数, 实体类别数]
_, entity_pred = torch.max(entity_logit, dim=-1) # [btz, 实体个数]
# v_pred和v_true是实体的预测结果
valid_index = (entity_ids.flatten()>0).nonzero().squeeze(-1)
valid_pred.extend(entity_pred.flatten()[valid_index].cpu().tolist())
valid_true.extend(entity_labels.flatten()[valid_index].cpu().tolist())
eval_step += 1
if (total_eval_step is not None) and (eval_step >= total_eval_step):
break
valid_true = np.array(valid_true)
valid_pred = np.array(valid_pred)
f1 = f1_score(valid_true, valid_pred, average='macro')
acc = accuracy_score(valid_true, valid_pred)
print(classification_report(valid_true, valid_pred))
# 只保留label,不需要prob
for k, v in result.items():
result[k] = {i: j[0] for i, j in v.items()}
return f1, acc, result
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])
model.load_weights(ckpt_path)
f1, acc, pred_result = Evaluator.evaluate(valid_dataloader)
# Tianchi news classification
Competition link: https://tianchi.aliyun.com/competition/entrance/531810/introduction?lang=zh-cn
| Solution | Description | Metric |
| ---- | ---- | ---- |
| Top1 | [Github](https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION) | official round f1=0.9735 |
| Top1 reproduction | bert initialized from the fold-1 checkpoint at epoch=5, trained for 1 epoch, ensemble of seed=0, 1993 and 2020 | long-term leaderboard f1=0.9736 |
| Top1_bert4torch reproduction | bert+attn+fgm+cv | long-term leaderboard f1=0.9727, dev_5cv=(0.97083, 0.97074, 0.96914, 0.96892, 0.96613) |
## Files
- convert.py: converts the tensorflow weights from the link above to pytorch
- training.py: finetune training code
\ No newline at end of file
import torch
import tensorflow as tf
tf_path = 'E:/Github/天池新闻分类/top1/pre_models/bert_model.ckpt'
torch_state_dict = {}
mapping = {
'bert/embeddings/word_embeddings': 'bert.embeddings.word_embeddings.weight',
'bert/embeddings/token_type_embeddings': 'bert.embeddings.token_type_embeddings.weight',
'bert/embeddings/position_embeddings': 'bert.embeddings.position_embeddings.weight',
'bert/embeddings/LayerNorm/beta': 'bert.embeddings.LayerNorm.bias',
'bert/embeddings/LayerNorm/gamma': 'bert.embeddings.LayerNorm.weight',
# 'bert/pooler/dense/kernel': 'bert.pooler.dense.weight',
# 'bert/pooler/dense/bias': 'bert.pooler.dense.bias',
# 'cls/seq_relationship/output_weights': 'cls.seq_relationship.weight',
# 'cls/seq_relationship/output_bias': 'cls.seq_relationship.bias',
'cls/predictions/transform/dense/kernel': 'cls.predictions.transform.dense.weight##T',
'cls/predictions/transform/dense/bias': 'cls.predictions.transform.dense.bias',
'cls/predictions/transform/LayerNorm/beta': 'cls.predictions.transform.LayerNorm.bias',
'cls/predictions/transform/LayerNorm/gamma': 'cls.predictions.transform.LayerNorm.weight',
'cls/predictions/output_bias': 'cls.predictions.bias',
}
for i in range(12):
prefix = 'bert/encoder/layer_%d/' % i
prefix_i = f'bert.encoder.layer.%d.' % i
mapping.update({
prefix + 'attention/self/query/kernel': prefix_i + 'attention.self.query.weight##T',
prefix + 'attention/self/query/bias': prefix_i + 'attention.self.query.bias',
prefix + 'attention/self/key/kernel': prefix_i + 'attention.self.key.weight##T',
prefix + 'attention/self/key/bias': prefix_i + 'attention.self.key.bias',
prefix + 'attention/self/value/kernel': prefix_i + 'attention.self.value.weight##T',
prefix + 'attention/self/value/bias': prefix_i + 'attention.self.value.bias',
prefix + 'attention/output/dense/kernel': prefix_i + 'attention.output.dense.weight##T',
prefix + 'attention/output/dense/bias': prefix_i + 'attention.output.dense.bias',
prefix + 'attention/output/LayerNorm/beta': prefix_i + 'attention.output.LayerNorm.bias',
prefix + 'attention/output/LayerNorm/gamma': prefix_i + 'attention.output.LayerNorm.weight',
prefix + 'intermediate/dense/kernel': prefix_i + 'intermediate.dense.weight##T',
prefix + 'intermediate/dense/bias': prefix_i + 'intermediate.dense.bias',
prefix + 'output/dense/kernel': prefix_i + 'output.dense.weight##T',
prefix + 'output/dense/bias': prefix_i + 'output.dense.bias',
prefix + 'output/LayerNorm/beta': prefix_i + 'output.LayerNorm.bias',
prefix + 'output/LayerNorm/gamma': prefix_i + 'output.LayerNorm.weight',
})
for old_key, new_key in mapping.items():
try:
ts = tf.train.load_variable(tf_path, old_key)
if new_key.endswith('##T'):
torch_state_dict[new_key[:-len('##T')]] = torch.from_numpy(ts).T  # strip the '##T' transpose marker (rstrip removes characters, not a suffix)
else:
torch_state_dict[new_key] = torch.from_numpy(ts)
except:
print('Missing ', old_key)
torch.save(torch_state_dict, 'E:/Github/天池新闻分类/top1/pre_models/pytorch_model.bin')
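# Optional sanity check (illustrative): reload the converted file and list a few keys to
# confirm that the mapping covered the expected weights.
converted = torch.load('E:/Github/天池新闻分类/top1/pre_models/pytorch_model.bin')
print(f'{len(converted)} tensors converted, e.g. {list(converted)[:3]}')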
# Model inference script
# Predict with each CV fold in turn and weight the fold logits by their dev metrics
from training import Model, collate_fn
import torch
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset
import pandas as pd
from tqdm import tqdm
import numpy as np
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 16
def load_data(df):
"""加载数据。"""
D = list()
for _, row in df.iterrows():
text = row['text']
D.append((text, 0))
return D
df_test = pd.read_csv('E:/Github/天池新闻分类/data/test_a.csv', sep='\t')
df_test['text'] = df_test['text'].apply(lambda x: x.strip().split())
test_data = load_data(df_test)
dev_dataloader = DataLoader(ListDataset(data=test_data), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)  # keep the original order so fold logits align and the submission matches the test file
f1_score = [0.97083, 0.97074, 0.96914, 0.96892, 0.96613]
y_pred_final = 0
for i in range(5):
model = Model().to(device)
model.load_weights(f'best_model_fold{i+1}.pt')
y_pred = []
for x, _ in tqdm(dev_dataloader, desc=f'evaluate_cv{i}'):
y_pred.append(model.predict(x).cpu().numpy())
# if len(y_pred) > 10:
# break
y_pred = np.concatenate(y_pred)
y_pred_final += y_pred * f1_score[i]
np.save(f'test_cv{i}_logit.npy', y_pred)
df_test = pd.DataFrame(y_pred_final.argmax(axis=1))
df_test.columns = ['label']
df_test.to_csv('submission.csv', index=False)
\ No newline at end of file
# Model training script
# Reference: https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION
# Only the finetune part of the Top1 solution is implemented here on top of bert4torch; the original author's pretrained weights are used after conversion to pytorch
import numpy as np
import pandas as pd
from bert4torch.models import build_transformer_model, BaseModel
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, ListDataset, Callback, EarlyStopping
from bert4torch.tokenizers import Tokenizer
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import torch
from torch import nn, optim
from tqdm import tqdm
# BERT base
config_path = 'E:/Github/天池新闻分类/top1/pre_models/bert_config.json'
checkpoint_path = 'E:/Github/天池新闻分类/top1/pre_models/pytorch_model.bin'
dict_path = 'E:/Github/天池新闻分类/top1/pre_models/vocab.txt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
n = 5 # Cross-validation
SEED = 2020
num_classes = 14
maxlen = 512
max_segment = 2
batch_size = 4
grad_accum_steps = 64
drop = 0.2
lr = 2e-5
epochs = 100
def load_data(df):
"""加载数据。"""
D = list()
for _, row in df.iterrows():
text = row['text']
label = row['label']
D.append((text, int(label)))
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def sentence_split(words):
"""句子截断。"""
document_len = len(words)
index = list(range(0, document_len, maxlen-2))
index.append(document_len)
segments = []
for i in range(len(index) - 1):
segment = words[index[i]: index[i + 1]]
assert len(segment) > 0
segment = tokenizer.tokens_to_ids(['[CLS]'] + segment + ['[SEP]'])
segments.append(segment)
assert len(segments) > 0
if len(segments) > max_segment:
segment_ = int(max_segment / 2)
return segments[:segment_] + segments[-segment_:]
else:
return segments
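# Worked example: with maxlen=512 and max_segment=2, a 1300-token document is cut into three
# windows of at most 510 tokens, and only the first and the last window are kept.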
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = sentence_split(text)
token_ids = sequence_padding(token_ids, length=maxlen)
batch_token_ids.append(token_ids)
batch_labels.append(label)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=max_segment), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, device=device)
return batch_token_ids, batch_labels
class Attention(nn.Module):
"""注意力层。"""
def __init__(self, hidden_size, **kwargs):
self.hidden_size = hidden_size
super().__init__(**kwargs)
self.weight = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.bias = nn.Parameter(torch.zeros(self.hidden_size))
self.query = nn.Linear(self.hidden_size, 1, bias=False)
def forward(self, x, mask):
'''x: [btz, max_segment, hdsz]
mask: [btz, max_segment, 1]
'''
mask = mask.squeeze(2) # [btz, max_segment]
# linear
key = self.weight(x) + self.bias # [btz, max_segment, hdsz]
# compute attention
outputs = self.query(key).squeeze(2) # [btz, max_segment]
outputs -= 1e32 * (1 - mask)
attn_scores = F.softmax(outputs, dim=-1)
attn_scores = attn_scores * mask
attn_scores = attn_scores.reshape(-1, 1, attn_scores.shape[-1]) # [btz, 1, max_segment]
outputs = torch.matmul(attn_scores, key).squeeze(1) # [btz, hdsz]
return outputs
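# Minimal sketch (illustrative only; the '_'-prefixed names are not used elsewhere): the masked
# attention pooling above collapses [btz, max_segment, hdsz] to [btz, hdsz], giving padded
# segments zero weight.
_attn_demo = Attention(8)
_x_demo = torch.randn(2, max_segment, 8)
_mask_demo = torch.tensor([[[1.], [1.]], [[1.], [0.]]])  # second sample has one padded segment
assert _attn_demo(_x_demo, _mask_demo).shape == (2, 8)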
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.dropout1 = nn.Dropout(0.1)
self.dropout2 = nn.Dropout(0.1)
self.attn = Attention(768)
self.dense = nn.Linear(768, num_classes)
def forward(self, token_ids):
''' token_ids: [btz, max_segment, max_len]
'''
input_mask = torch.any(token_ids, dim=-1, keepdim=True).long() # [btz, max_segment, 1]
token_ids = token_ids.reshape(-1, token_ids.shape[-1]) # [btz*max_segment, max_len]
output = self.bert([token_ids])[:, 0] # [btz*max_segment, hdsz]
output = output.reshape((-1, max_segment, output.shape[-1])) # [btz, max_segment, hdsz]
output = output * input_mask
output = self.dropout1(output)
output = self.attn(output, input_mask)
output = self.dropout2(output)
output = self.dense(output)
return output
class Evaluator(Callback):
def __init__(self, model, dataloader, fold):
super().__init__()
self.model = model
self.dataloader = dataloader
self.best_val_f1 = 0.
self.fold = fold
def evaluate(self):
y_true, y_pred = list(), list()
for x, y in tqdm(self.dataloader, desc='evaluate'):
y_true.append(y.cpu().numpy())
y_pred.append(self.model.predict(x).argmax(axis=1).cpu().numpy())
y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
return f1
def on_epoch_end(self, steps, epoch, logs=None):
val_f1 = self.evaluate()
if val_f1 > self.best_val_f1:
self.best_val_f1 = val_f1
self.model.save_weights(f'best_model_fold{self.fold}.pt')
logs['val_f1'] = val_f1 # 这个要设置,否则EarlyStopping不生效
print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}\n')
def do_train(df_train):
skf = StratifiedKFold(n_splits=n, random_state=SEED, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train['text'], df_train['label']), 1):
print(f'[Fold {fold}]')
train_data = load_data(df_train.iloc[trn_idx])
valid_data = load_data(df_train.iloc[val_idx])
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)
model = Model().to(device)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=lr), adversarial_train={'name': 'fgm'})
callbacks = [
Evaluator(model, valid_dataloader, fold),
EarlyStopping(monitor='val_f1', patience=5, verbose=1, mode='max'), # 需要在Evaluator后面
]
model.fit(
train_dataloader,
steps_per_epoch=None,
epochs=epochs,
grad_accumulation_steps=grad_accum_steps,
callbacks=callbacks
)
del model
if __name__ == '__main__':
df_train = pd.read_csv('E:/Github/天池新闻分类/data/train_set.csv', sep='\t')
df_train['text'] = df_train['text'].apply(lambda x: x.strip().split())
do_train(df_train)
#! -*- coding:utf-8 -*-
# Sentence-pair classification task on the LCQMC dataset
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tensorboardX import SummaryWriter
maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir='./summary') # prepare summary writer
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text1, text2, label = l.strip().split('\t')
D.append((text1, text2, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text1, text2, label in batch:
token_ids, segment_ids = tokenizer.encode(text1, text2, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return (batch_token_ids, batch_segment_ids), batch_labels.flatten()
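# Note (based on the usual bert4torch/bert4keras tokenizer convention, stated as an assumption):
# tokenizer.encode(text1, text2, maxlen=maxlen) packs the pair as [CLS] text1 [SEP] text2 [SEP]
# and returns segment ids that are 0 for the first sentence and 1 for the second.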
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.test.data'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# 定义评价函数
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_batch_end(self, global_step, batch, logs=None):
if global_step % 10 == 0:
writer.add_scalar(f"train/loss", logs['loss'], global_step)
val_acc = evaluate(valid_dataloader)
writer.add_scalar(f"valid/acc", val_acc, global_step)
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification task, loading bert weights
# valid_acc: 94.72, test_acc: 94.11
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir='./summary') # prepare summary writer
choice = 'train' # train表示训练,infer表示推理
# 固定seed
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
hidden_states, pooling = self.bert([token_ids, segment_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
# def on_batch_end(self, global_step, batch, logs=None):
# if global_step % 10 == 0:
# writer.add_scalar(f"train/loss", logs['loss'], global_step)
# val_acc = evaluate(valid_dataloader)
# writer.add_scalar(f"valid/acc", val_acc, global_step)
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
def inference(texts):
'''单条样本推理
'''
for text in texts:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]
logit = model.predict([token_ids, segment_ids])
y_pred = torch.argmax(torch.softmax(logit, dim=-1)).cpu().numpy()
print(text, ' ----> ', y_pred)
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
inference(['我今天特别开心', '我今天特别生气'])
#! -*- coding:utf-8 -*-
# Sentiment classification task, loading GAU-alpha weights
# Blog: https://kexue.fm/archives/9052
# Weight conversion script: ./convert_script/convert_GAU_alpha.py
# valid_acc: 95.25, test_acc: 94.46
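# Compared with the BERT-based script above, the main model-side changes are the GAU-alpha
# checkpoint paths, model='gau_alpha' in build_transformer_model, and manual [CLS] pooling in forward().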
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import random
import os
import numpy as np
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Build the dataloaders
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of the GAU-alpha backbone
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='gau_alpha')
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
last_hidden_state = self.bert([token_ids, segment_ids])
output = self.dropout(last_hidden_state[:, 0, :])
output = self.dense(output)
return output
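# GAU-alpha is loaded here without a pooler (no with_pool argument), so the classifier takes the
# hidden state of the first token ([CLS]) directly as the sentence representation.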
model = Model().to(device)
# Define the loss and optimizer to use; custom ones are also supported
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# Evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
# def on_batch_end(self, global_step, batch, logs=None):
# if global_step % 10 == 0:
# writer.add_scalar(f"train/loss", logs['loss'], global_step)
# val_acc = evaluate(valid_dataloader)
# writer.add_scalar(f"valid/acc", val_acc, global_step)
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
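# The else branch runs when this file is imported as a module instead of being executed directly:
# it loads best_model.pt (note the save_weights call in Evaluator is commented out, so enable it
# or provide the checkpoint) so the model can be reused for inference elsewhere.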