Commit 0e29b9b7 authored by xuxo

yidong infer init
#! -*- coding:utf-8 -*-
# Triplet (SPO) extraction task, based on the "half-pointer, half-tagging" structure
# Article: https://kexue.fm/archives/7161
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
import numpy as np
from bert4torch.layers import LayerNorm
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn as nn
maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the predicate label dictionary
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def collate_fn(batch):
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
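# e.g. search([3, 4], [1, 2, 3, 4, 5]) returns 2 and search([9], [1, 2]) returns -1,
# so gold spans whose tokens cannot be located in token_ids are simply skipped below.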
batch_token_ids, batch_segment_ids = [], []
batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
for d in batch:
token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
# Collect the triplets as {s: [(o, p)]}
spoes = {}
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
p = predicate2id[p]
o = tokenizer.encode(o)[0][1:-1]
s_idx = search(s, token_ids)
o_idx = search(o, token_ids)
if s_idx != -1 and o_idx != -1:
s = (s_idx, s_idx + len(s) - 1)
o = (o_idx, o_idx + len(o) - 1, p)
if s not in spoes:
spoes[s] = []
spoes[s].append(o)
if spoes:
# subject labels
subject_labels = np.zeros((len(token_ids), 2))
for s in spoes:
subject_labels[s[0], 0] = 1 # subject首
subject_labels[s[1], 1] = 1 # subject尾
# Randomly pick one subject (this is not an implementation bug; it is the intended behaviour!)
# Todo: the unselected subjects could probably be masked so the loss ignores them; the impact is likely small because the model up-weights positives via prob**n
start, end = np.array(list(spoes.keys())).T
start = np.random.choice(start)
end = np.random.choice(end[end >= start])
subject_ids = (start, end)
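# e.g. if spoes has subjects {(2, 4), (10, 12)}, start is drawn from [2, 10] and end from the ends >= start,
# so a cross-pairing such as (2, 12) can be sampled; per the note above this is the intended behaviour.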
# object labels for the selected subject
object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
for o in spoes.get(subject_ids, []):
object_labels[o[0], o[2], 0] = 1
object_labels[o[1], o[2], 1] = 1
# Build the batch
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_subject_labels.append(subject_labels)
batch_subject_ids.append(subject_ids)
batch_object_labels.append(object_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_subject_labels = torch.tensor(sequence_padding(batch_subject_labels), dtype=torch.float, device=device)
batch_subject_ids = torch.tensor(batch_subject_ids, dtype=torch.long, device=device)
batch_object_labels = torch.tensor(sequence_padding(batch_object_labels), dtype=torch.float, device=device)
batch_attention_mask = (batch_token_ids != tokenizer._token_pad_id)
return [batch_token_ids, batch_segment_ids, batch_subject_ids], [batch_subject_labels, batch_object_labels, batch_attention_mask]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path)
self.linear1 = nn.Linear(768, 2)
self.condLayerNorm = LayerNorm(hidden_size=768, conditional_size=768*2)
self.linear2 = nn.Linear(768, len(predicate2id)*2)
@staticmethod
def extract_subject(inputs):
"""根据subject_ids从output中取出subject的向量表征
"""
output, subject_ids = inputs
start = torch.gather(output, dim=1, index=subject_ids[:, :1].unsqueeze(2).expand(-1, -1, output.shape[-1]))
end = torch.gather(output, dim=1, index=subject_ids[:, 1:].unsqueeze(2).expand(-1, -1, output.shape[-1]))
subject = torch.cat([start, end], 2)
return subject[:, 0]
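# Shape walkthrough: output is [btz, seq_len, hdsz] and subject_ids is [btz, 2]; each gather yields
# [btz, 1, hdsz], the concat gives [btz, 1, 2*hdsz], and subject[:, 0] returns [btz, 2*hdsz].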
def forward(self, inputs):
# Predict the subjects
seq_output = self.bert(inputs[:2]) # [btz, seq_len, hdsz]
subject_preds = (torch.sigmoid(self.linear1(seq_output)))**2 # [btz, seq_len, 2]
# Feed in the subject and predict the objects
# Conditional Layer Normalization is used to fuse the subject into the object prediction
subject_ids = inputs[2]
# In theory the pre-LayerNorm hidden states should be used, but only the top output of each block can be returned here, so this differs from the keras implementation
subject = self.extract_subject([seq_output, subject_ids])
output = self.condLayerNorm([seq_output, subject])
output = (torch.sigmoid(self.linear2(output)))**4
object_preds = output.reshape(*output.shape[:2], len(predicate2id), 2)
return [subject_preds, object_preds]
def predict_subject(self, inputs):
self.eval()
with torch.no_grad():
seq_output = self.bert(inputs[:2]) # [btz, seq_len, hdsz]
subject_preds = (torch.sigmoid(self.linear1(seq_output)))**2 # [btz, seq_len, 2]
return [seq_output, subject_preds]
def predict_object(self, inputs):
self.eval()
with torch.no_grad():
seq_output, subject_ids = inputs
subject = self.extract_subject([seq_output, subject_ids])
output = self.condLayerNorm([seq_output, subject])
output = (torch.sigmoid(self.linear2(output)))**4
object_preds = output.reshape(*output.shape[:2], len(predicate2id), 2)
return object_preds
train_model = Model().to(device)
class BCELoss(nn.BCELoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, inputs, targets):
subject_preds, object_preds = inputs
subject_labels, object_labels, mask = targets
# subject loss
subject_loss = super().forward(subject_preds, subject_labels)
subject_loss = subject_loss.mean(dim=-1)
subject_loss = (subject_loss * mask).sum() / mask.sum()
# object loss
object_loss = super().forward(object_preds, object_labels)
object_loss = object_loss.mean(dim=-1).sum(dim=-1)
object_loss = (object_loss * mask).sum() / mask.sum()
return subject_loss + object_loss
train_model.compile(loss=BCELoss(reduction='none'), optimizer=optim.Adam(train_model.parameters(), 1e-5))
def extract_spoes(text):
"""抽取输入text所包含的三元组
"""
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
# Extract subjects
seq_output, subject_preds = train_model.predict_subject([token_ids, segment_ids])
subject_preds[:, [0, -1]] *= 0 # zero out the leading [CLS] and trailing [SEP] positions
start = torch.where(subject_preds[0, :, 0] > 0.6)[0]
end = torch.where(subject_preds[0, :, 1] > 0.5)[0]
subjects = []
for i in start:
j = end[end >= i]
if len(j) > 0:
j = j[0]
subjects.append((i.item(), j.item()))
if subjects:
spoes = []
# token_ids = token_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
# segment_ids = segment_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
seq_output = seq_output.repeat([len(subjects)]+[1]*(len(seq_output.shape)-1))
subjects = torch.tensor(subjects, dtype=torch.long, device=device)
# Feed in the subjects and extract objects and predicates
object_preds = train_model.predict_object([seq_output, subjects])
object_preds[:, [0, -1]] *= 0
for subject, object_pred in zip(subjects, object_preds):
start = torch.where(object_pred[:, :, 0] > 0.6)
end = torch.where(object_pred[:, :, 1] > 0.5)
for _start, predicate1 in zip(*start):
for _end, predicate2 in zip(*end):
if _start <= _end and predicate1 == predicate2:
spoes.append(
((mapping[subject[0]][0],
mapping[subject[1]][-1]), predicate1.item(),
(mapping[_start][0], mapping[_end][-1]))
)
break
return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1])
for s, p, o, in spoes]
else:
return []
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (
tuple(tokenizer.tokenize(spo[0])),
spo[1],
tuple(tokenizer.tokenize(spo[2])),
)
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 1e-10, 1e-10, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
R = set([SPO(spo) for spo in extract_spoes(d['text'])])
T = set([SPO(spo) for spo in d['spo_list']])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
pbar.update()
pbar.set_description(
'f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall)
)
s = json.dumps({
'text': d['text'],
'spo_list': list(T),
'spo_list_pred': list(R),
'new': list(R - T),
'lack': list(T - R),
},
ensure_ascii=False,
indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
# optimizer.apply_ema_weights()
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# train_model.save_weights('best_model.pt')
# optimizer.reset_old_weights()
print(
'f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
(f1, precision, recall, self.best_val_f1)
)
if __name__ == '__main__':
evaluator = Evaluator()
train_model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
train_model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Triplet extraction task: a TPLinker-style design based on GlobalPointer
# Article: https://kexue.fm/archives/8888
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.layers import GlobalPointer
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.losses import SparseMultilabelCategoricalCrossentropy
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the predicate label dictionary
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def collate_fn(batch):
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
batch_token_ids, batch_segment_ids = [], []
batch_entity_labels, batch_head_labels, batch_tail_labels = [], [], []
for d in batch:
token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
# Collect the gold triplets as (sh, st, p, oh, ot) spans
spoes = set()
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
p = predicate2id[p]
o = tokenizer.encode(o)[0][1:-1]
sh = search(s, token_ids)
oh = search(o, token_ids)
if sh != -1 and oh != -1:
spoes.add((sh, sh + len(s) - 1, p, oh, oh + len(o) - 1))
# Build the labels
entity_labels = [set() for _ in range(2)]
head_labels = [set() for _ in range(len(predicate2id))]
tail_labels = [set() for _ in range(len(predicate2id))]
for sh, st, p, oh, ot in spoes:
entity_labels[0].add((sh, st))
entity_labels[1].add((oh, ot))
head_labels[p].add((sh, oh))
tail_labels[p].add((st, ot))
for label in entity_labels + head_labels + tail_labels:
if not label: # there must be at least one label
label.add((0, 0)) # pad with (0, 0) if empty
entity_labels = sequence_padding([list(l) for l in entity_labels]) # [subject/object=2, 实体个数, 实体起终点]
head_labels = sequence_padding([list(l) for l in head_labels]) # [关系个数, 该关系下subject/object配对数, subject/object起点]
tail_labels = sequence_padding([list(l) for l in tail_labels]) # [关系个数, 该关系下subject/object配对数, subject/object终点]
# Build the batch
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_entity_labels.append(entity_labels)
batch_head_labels.append(head_labels)
batch_tail_labels.append(tail_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
# batch_entity_labels: [btz, subject/object=2, num_entities, entity start/end]
# batch_head_labels: [btz, num_relations, num_subject/object_pairs, subject/object heads]
# batch_tail_labels: [btz, num_relations, num_subject/object_pairs, subject/object tails]
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, seq_dims=2), dtype=torch.float, device=device)
batch_head_labels = torch.tensor(sequence_padding(batch_head_labels, seq_dims=2), dtype=torch.float, device=device)
batch_tail_labels = torch.tensor(sequence_padding(batch_tail_labels, seq_dims=2), dtype=torch.float, device=device)
return [batch_token_ids, batch_segment_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path)
self.entity_output = GlobalPointer(hidden_size=768, heads=2, head_size=64)
self.head_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)
self.tail_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)
def forward(self, inputs):
hidden_states = self.bert(inputs) # [btz, seq_len, hdsz]
mask = inputs[0].gt(0).long()
entity_output = self.entity_output(hidden_states, mask) # [btz, heads, seq_len, seq_len]
head_output = self.head_output(hidden_states, mask) # [btz, heads, seq_len, seq_len]
tail_output = self.tail_output(hidden_states, mask) # [btz, heads, seq_len, seq_len]
return entity_output, head_output, tail_output
model = Model().to(device)
class MyLoss(SparseMultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_preds, y_trues):
''' y_preds: list of Tensors, each of shape [btz, heads, seq_len, seq_len]
'''
loss_list = []
for y_pred, y_true in zip(y_preds, y_trues):
shape = y_pred.shape
# multiply by seq_len because the flattened index of (i, j) in the seq_len*seq_len grid is i*seq_len + j
y_true = y_true[..., 0] * shape[2] + y_true[..., 1] # [btz, heads, 实体起终点的下标]
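# e.g. with seq_len=128 a span (i, j) = (3, 5) becomes 3*128 + 5 = 389, which matches
# the position of (3, 5) once the seq_len x seq_len score map is flattened on the next line.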
y_pred = y_pred.reshape(shape[0], -1, np.prod(shape[2:])) # [btz, heads, seq_len*seq_len]
loss = super().forward(y_pred, y_true.long())
loss = torch.mean(torch.sum(loss, dim=1))
loss_list.append(loss)
return {'loss': sum(loss_list)/3, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}
model.compile(loss=MyLoss(mask_zero=True), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['entity_loss', 'head_loss', 'tail_loss'])
def extract_spoes(text, threshold=0):
"""抽取输入text所包含的三元组
"""
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
outputs = model.predict([token_ids, segment_ids])
outputs = [o[0].cpu().numpy() for o in outputs] # [heads, seq_len, seq_len]
# Extract subjects and objects
subjects, objects = set(), set()
outputs[0][:, [0, -1]] -= float('inf')
outputs[0][:, :, [0, -1]] -= float('inf')
for l, h, t in zip(*np.where(outputs[0] > threshold)):
if l == 0:
subjects.add((h, t))
else:
objects.add((h, t))
# Identify the corresponding predicates
spoes = set()
for sh, st in subjects:
for oh, ot in objects:
p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
ps = set(p1s) & set(p2s)
for p in ps:
spoes.add((
text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[p],
text[mapping[oh][0]:mapping[ot][-1] + 1]
))
return list(spoes)
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 0, 1e-10, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
R = set([SPO(spo) for spo in extract_spoes(d['text'])])
T = set([SPO(spo) for spo in d['spo_list']])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
pbar.update()
pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall))
s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
# optimizer.apply_ema_weights()
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
# optimizer.reset_old_weights()
print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %(f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Triplet extraction task with TPLinker; with the 'cat' shaking type the entity part converges quickly while the relation part converges more slowly
# Official repo: https://github.com/131250208/TPlinker-joint-extraction
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.layers import TplinkerHandshakingKernel
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
maxlen = 50
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the predicate label dictionary
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def trans_ij2k(seq_len, i, j):
'''Convert row i, column j into its index in the flattened upper triangle
'''
if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
return 0
return int(0.5*(2*seq_len-i+1)*i+(j-i))
map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}
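# Sanity check of the upper-triangle indexing (a quick sketch): for seq_len=4 the pairs with j >= i map to 0..9,
# e.g. (0, 0)->0, (0, 3)->3, (1, 1)->4, (3, 3)->9, so pair_len == seq_len*(seq_len+1)//2 and map_k2ij inverts map_ij2k.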
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
def collate_fn(batch):
pair_len = maxlen * (maxlen+1)//2
# batch_entity_labels: [btz, pair_len]
# batch_head_labels: [btz, rel_size, pair_len]
# batch_tail_labels: [btz, rel_size, pair_len]
batch_entity_labels = torch.zeros((len(batch), pair_len), dtype=torch.long, device=device)
batch_head_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)
batch_tail_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)
batch_token_ids = []
for i, d in enumerate(batch):
token_ids = tokenizer.encode(d['text'])[0][1:-1][:maxlen] # truncate to the first maxlen tokens
batch_token_ids.append(token_ids)
# Fill the labels for each gold triplet
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
p = predicate2id[p]
o = tokenizer.encode(o)[0][1:-1]
sh = search(s, token_ids) # spans beyond maxlen will not be found
oh = search(o, token_ids)
if sh != -1 and oh != -1:
st, ot = sh+len(s)-1, oh+len(o)-1
batch_entity_labels[i, map_ij2k[sh, st]] = 1
batch_entity_labels[i, map_ij2k[oh, ot]] = 1
if sh <= oh:
batch_head_labels[i, p, map_ij2k[sh, oh]] = 1
else:
batch_head_labels[i, p, map_ij2k[oh, sh]] = 2
if st <= ot:
batch_tail_labels[i, p, map_ij2k[st, ot]] = 1
else:
batch_tail_labels[i, p, map_ij2k[ot, st]] = 2
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
return [batch_token_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0)
self.combine_fc = nn.Linear(768*2, 768)
self.ent_fc = nn.Linear(768, 2)
self.head_rel_fc = nn.Linear(768, len(predicate2id)*3)
self.tail_rel_fc = nn.Linear(768, len(predicate2id)*3)
self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cat')
def forward(self, inputs):
last_hidden_state = self.bert(inputs) # [btz, seq_len, hdsz]
shaking_hiddens = self.handshaking_kernel(last_hidden_state) # [btz, pair_len, hdsz]
ent_shaking_outputs = self.ent_fc(shaking_hiddens) # [btz, pair_len, 2]
btz, pair_len = shaking_hiddens.shape[:2]
head_rel_shaking_outputs = self.head_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3) #[btz, predicate_num, pair_len, 3]
tail_rel_shaking_outputs = self.tail_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3)
return ent_shaking_outputs, head_rel_shaking_outputs, tail_rel_shaking_outputs
model = Model().to(device)
class MyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_preds, y_trues):
loss_list = []
for y_pred, y_true in zip(y_preds, y_trues):
loss = super().forward(y_pred.view(-1, y_pred.size()[-1]), y_true.view(-1))
loss_list.append(loss)
z = (2 * len(predicate2id) + 1)
total_steps = 6000 # weight entity recognition more heavily early on; this could also be set to model.total_steps
w_ent = max(1 / z + 1 - model.global_step / total_steps, 1 / z)
w_rel = min((len(predicate2id) / z) * model.global_step / total_steps, (len(predicate2id) / z))
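# Sketch of the schedule above: at global_step=0 the weights are about (w_ent, w_rel) = (1, 0);
# they change linearly until total_steps and then stay at (1/z, len(predicate2id)/z),
# i.e. the loss shifts from entity recognition towards the two relation matrices.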
loss = w_ent*loss_list[0] + w_rel*loss_list[1] + w_rel*loss_list[2]
return {'loss': loss, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 5e-5), metrics=['entity_loss', 'head_loss', 'tail_loss'])
def extract_spoes(text):
"""抽取输入text所包含的三元组
"""
def get_spots_fr_shaking_tag(shaking_tag):
'''Parse the relations from the shaking tag
'''
spots = []
for shaking_inds in shaking_tag.nonzero():
rel_id = shaking_inds[0].item()
tag_id = shaking_tag[rel_id][shaking_inds[1]].item()
matrix_inds = map_k2ij[shaking_inds[1].item()]
# make sure the subject comes first and the object second
if tag_id == 1:
spot = (rel_id, matrix_inds[0], matrix_inds[1])
elif tag_id == 2:
spot = (rel_id, matrix_inds[1], matrix_inds[0])
spots.append(spot)
return spots
tokens = tokenizer.tokenize(text)[1:-1]
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.encode(text)[0][1:-1]
token_ids_ts = torch.tensor(sequence_padding([token_ids], length=maxlen), dtype=torch.long, device=device)
outputs = model.predict([token_ids_ts])
outputs = [o[0].argmax(dim=-1) for o in outputs]
# Extract entities
ent_matrix_spots = set()
ent_text = set()
for shaking_ind in outputs[0].nonzero():
shaking_ind_ = shaking_ind[0].item()
# tag_id = outputs[0][shaking_ind_]
matrix_inds = map_k2ij[shaking_ind_]
spot = (matrix_inds[0], matrix_inds[1])
if (spot[0] < len(mapping)) and (spot[1] < len(mapping)): # entity boundaries must fall within the mapping range
ent_matrix_spots.add(spot)
ent_text.add(text[mapping[spot[0]][0]:mapping[spot[1]][-1] + 1])
# Identify the corresponding predicates
head_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[1])
tail_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[2])
spoes = []
for rel_h, sh, oh in head_rel_matrix_spots:
for rel_t, st, ot in tail_rel_matrix_spots:
# if the relations match and both (sh, st) and (oh, ot) are in ent_matrix_spots
if (rel_h == rel_t) and ((sh, st) in ent_matrix_spots) and ((oh, ot) in ent_matrix_spots):
spoes.append((text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[rel_h], text[mapping[oh][0]:mapping[ot][-1] + 1]))
return spoes, token_ids, ent_text
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 0, 1e-10, 1e-10
E1, E2 = 0, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
spoes, token_ids, ent_text_pred = extract_spoes(d['text'])
# spo_list keeps only the gold triplets that survive the maxlen truncation
spo_list = []
for s, p, o in d['spo_list']:
s_ = tokenizer.encode(s)[0][1:-1]
o_ = tokenizer.encode(o)[0][1:-1]
sh = search(s_, token_ids) # spans beyond maxlen will not be found
oh = search(o_, token_ids)
if sh != -1 and oh != -1:
spo_list.append((s, p, o))
# Compute the triplet f1
R = set([SPO(spo) for spo in spoes])
T = set([SPO(spo) for spo in spo_list])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
# Compute the entity-level metric
ent_text_truth = set([spo[0] for spo in spo_list] + [spo[-1] for spo in spo_list])
E1 += len(ent_text_pred & ent_text_truth)
E2 += len(ent_text_truth)
E_acc = E1 / E2
# accuracy of entity_matrix, head_matrix and tail_matrix could also be computed here
pbar.update()
pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f, ent_acc: %.5f' % (f1, precision, recall, E_acc))
s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Triplet extraction task with tplinker_plus
# Official repo: https://github.com/131250208/TPlinker-joint-extraction
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import TplinkerHandshakingKernel
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
maxlen = 50
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the predicate label dictionary
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
return D
def trans_ij2k(seq_len, i, j):
'''Convert row i, column j into its index in the flattened upper triangle
'''
if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
return 0
return int(0.5*(2*seq_len-i+1)*i+(j-i))
map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}
def tran_ent_rel2id():
'''Build the tag mapping for the final classification layer
'''
tag2id = {'ent': 0}
for p in predicate2id.keys():
for mode in ['sh_oh', 'oh_sh', 'st_ot', 'ot_st']:
tag2id[p+'##'+mode] = len(tag2id)
return tag2id
tag2id = tran_ent_rel2id()
id2tag = {v: k for k, v in tag2id.items()}
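# Resulting label space (from the loop above): 1 + 4*len(predicate2id) tags in total, namely 'ent' plus,
# for every predicate p, the four relation tags p##sh_oh, p##oh_sh, p##st_ot and p##ot_st.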
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
def collate_fn(batch):
pair_len = maxlen * (maxlen+1)//2
# batch_labels: [btz, pair_len, tag2id_len]
batch_labels = torch.zeros((len(batch), pair_len, len(tag2id)), dtype=torch.long, device=device)
batch_token_ids = []
for i, d in enumerate(batch):
token_ids = tokenizer.encode(d['text'])[0][1:-1][:maxlen] # truncate to the first maxlen tokens
batch_token_ids.append(token_ids)
# Fill the labels for each gold triplet
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
o = tokenizer.encode(o)[0][1:-1]
sh = search(s, token_ids)
oh = search(o, token_ids)
if sh != -1 and oh != -1:
st, ot = sh+len(s)-1, oh+len(o)-1
batch_labels[i, map_ij2k[sh, st], tag2id['ent']] = 1
batch_labels[i, map_ij2k[oh, ot], tag2id['ent']] = 1
if sh <= oh:
batch_labels[i, map_ij2k[sh, oh], tag2id[p+'##sh_oh']] = 1
else:
batch_labels[i, map_ij2k[oh, sh], tag2id[p+'##oh_sh']] = 1
if st <= ot:
batch_labels[i, map_ij2k[st, ot], tag2id[p+'##st_ot']] = 1
else:
batch_labels[i, map_ij2k[ot, st], tag2id[p+'##ot_st']] = 1
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
return [batch_token_ids], batch_labels
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0)
self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cln_plus', inner_enc_type='lstm')
self.fc = nn.Linear(768, len(tag2id))
def forward(self, inputs):
last_hidden_state = self.bert(inputs) # [btz, seq_len, hdsz]
shaking_hiddens = self.handshaking_kernel(last_hidden_state)
output = self.fc(shaking_hiddens) # [btz, pair_len, tag_size]
return output
model = Model().to(device)
model.compile(loss=MultilabelCategoricalCrossentropy(), optimizer=optim.Adam(model.parameters(), 5e-5))
def extract_spoes(text, threshold=0):
"""抽取输入text所包含的三元组
"""
tokens = tokenizer.tokenize(text)[1:-1]
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.encode(text)[0][1:-1]
token_ids_ = torch.tensor(sequence_padding([token_ids], length=maxlen), dtype=torch.long, device=device)
outputs = model.predict([token_ids_])[0].cpu().numpy() # [pair_len, tag_size]
# Extract entities and identify the corresponding predicates
ent_matrix_spots, ent_text = set(), set()
head_rel_matrix_spots, tail_rel_matrix_spots = [], []
for shaking_ind, tag_id in zip(*np.where(outputs > threshold)):
matrix_inds = map_k2ij[shaking_ind]
spot = (matrix_inds[0], matrix_inds[1])
if (spot[0] < len(mapping)) and (spot[1] < len(mapping)): # entity boundaries must fall within the mapping range
p = id2tag[tag_id].split('##')[0]
if id2tag[tag_id] == 'ent':
ent_matrix_spots.add(spot)
ent_text.add(text[mapping[spot[0]][0]:mapping[spot[1]][-1] + 1])
else:
p = predicate2id[p]
if id2tag[tag_id].endswith('##sh_oh'):
head_rel_matrix_spots.append((p, spot[0], spot[1]))
elif id2tag[tag_id].endswith('##oh_sh'):
head_rel_matrix_spots.append((p, spot[1], spot[0]))
elif id2tag[tag_id].endswith('##st_ot'):
tail_rel_matrix_spots.append((p, spot[0], spot[1]))
elif id2tag[tag_id].endswith('##ot_st'):
tail_rel_matrix_spots.append((p, spot[1], spot[0]))
spoes = []
for rel_h, sh, oh in head_rel_matrix_spots:
for rel_t, st, ot in tail_rel_matrix_spots:
# if the relations match and both (sh, st) and (oh, ot) are in ent_matrix_spots
if (rel_h == rel_t) and ((sh, st) in ent_matrix_spots) and ((oh, ot) in ent_matrix_spots):
spoes.append((text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[rel_h], text[mapping[oh][0]:mapping[ot][-1] + 1]))
return spoes, token_ids, ent_text
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 0, 1e-10, 1e-10
E1, E2 = 0, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
spoes, token_ids, ent_text_pred = extract_spoes(d['text'])
# spo_list keeps only the gold triplets that survive the maxlen truncation
spo_list = []
for s, p, o in d['spo_list']:
s_ = tokenizer.encode(s)[0][1:-1]
o_ = tokenizer.encode(o)[0][1:-1]
sh = search(s_, token_ids) # spans beyond maxlen will not be found
oh = search(o_, token_ids)
if sh != -1 and oh != -1:
spo_list.append((s, p, o))
# Compute the triplet f1
R = set([SPO(spo) for spo in spoes])
T = set([SPO(spo) for spo in spo_list])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
# Compute the entity-level metric
ent_text_truth = set([spo[0] for spo in spo_list] + [spo[-1] for spo in spo_list])
E1 += len(ent_text_pred & ent_text_truth)
E2 += len(ent_text_truth)
E_acc = E1 / E2
# accuracy of entity_matrix, head_matrix and tail_matrix could also be computed here
pbar.update()
pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f, ent_acc: %.5f' % (f1, precision, recall, E_acc))
s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataset.data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
# Sohu entity-based sentiment classification
- Competition link: https://www.biendata.xyz/competition/sohu_2022/
| Solution | Link | Metric |
| ---- | ---- | ---- |
| Top1 | [Zhihu](https://zhuanlan.zhihu.com/p/533808475) | preliminary f1=0.7253, final f1=0.8173 |
| baseline | —— | preliminary f1=0.6737 |
# bert4torch reproduction
- The pretrained model is xlnet
- Since the competition has ended and submissions are closed, the reproduction is compared only on the offline dev set
- dev is the first 2000 samples rather than the last 10% used in the original solution, so the dev metric is slightly unstable
| Reproduction | Settings | Metric |
| ---- | ---- | ---- |
| Top1_github | first 2000 as dev, no swa, with warmup, no label_smoothing, no fgm, grad accumulation=3, no rdrop | Epoch 5/10: f1=0.7697 |
| Top1_bert4torch repro 1 | same settings as above | Epoch 8/10: f1=0.7556 |
| Top1_bert4torch repro 2 | same settings as above + fgm + swa | Epoch 5/10: f1=0.7877 |

| Epoch | Top1_github | Top1_bert4torch repro 1 | Top1_bert4torch repro 2 |
| ---- | ---- | ---- | ---- |
| 1 | 0.728 | 0.7039 | 0.0274 |
| 2 | 0.7198 | 0.7327 | 0.7180 |
| 3 | 0.747 | 0.7531 | 0.7453 |
| 4 | 0.7625 | 0.7466 | 0.7594 |
| 5 | **0.7697** | 0.7464 | **0.7877** |
| 6 | 0.7638 | 0.7272 | 0.7726 |
| 7 | 0.7415 | 0.7471 | 0.7804 |
| 8 | 0.7593 | **0.7556** | 0.7829 |
| 9 | 0.7477 | 0.7455 | 0.7697 |
| 10 | 0.7466 | 0.7471 | 0.7620 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1883\n",
"样本总量: 89195\n",
"================================样本0, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 1, \"content\": \"3.新疆棉是全球业界公认的高品质天然纤维原料,较好满足了全球范围内对棉制纺织品服装的刚性消费需求,是中国乃至全球纺织工业健康可持续发展的重要原料保障。近年来,新疆地区不仅棉花种植生产保持稳定,棉纺织及服装产业也迅速发展,为促进地区经济发展、解决各族人民就业、改善民生福祉发挥了重要作用。新疆棉花种植和棉纺产业是全球纺织供应链的重要组成部分,2021年,新疆棉产量512.9万吨,约占全球棉花产量的20%,美国政府打压新疆棉花及其制品的行为,势必严重危害全球纺织产业供应链的安全,损害全球数千万产业工人的切身利益,对此我们表示强烈反对。4.2021年1月,新疆纺织行业协会发布了详实、客观的《新疆棉纺织行业社会责任报告》,报告以详实的数据和资料充分说明中国新疆维吾尔自治区不存在所谓的“强迫劳动”。我们建议全球纺织业界各相关利益方查阅《报告》的内容和观点,尊重从事实出发的价值观,拒绝虚伪的政治操作,反对恶意造谣。我们欢迎包括美国同业在内的国际品牌、机构实地走访考察新疆棉花产区、纺织服装工厂,独立了解、判断相关事实。我们愿为相关考察和贸易投资合作提供便利与协助。\", \"entity\": {\"美国\": 0, \"中国\": 0}}\\n']\n",
"================================样本1, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 22269, \"content\": \"新华社北京8月27日电美国疾病控制和预防中心日前发布的一项研究结果显示,新冠变异病毒德尔塔毒株成为主要流行毒株后,在美获批的疫苗整体有效性降低约三分之一。研究人员分析了抗疫一线工作人员从2020年12月14日开始的疫苗接种和新冠感染情况。美国多个州的数千名抗疫一线工作人员参加了这项研究,他们每周接受核酸检测。在德尔塔毒株成为主要流行毒株期间,488名没有接种疫苗者中有19人感染,其中有症状感染者的比例为94.7%;2352名完全接种疫苗者中有24人感染,其中有症状感染者的比例为75%。现有研究没有包含感染后的病情严重程度。研究人员分析各种因素后认为,在德尔塔毒株成为主要流行毒株后,美国辉瑞、莫德纳和强生疫苗的整体有效性为66%。而先前发布的数据显示,截至2021年4月10日,这些疫苗的整体有效性为91%。据媒体报道,研究人员计划进一步分析不同疫苗的有效性,以及接种疫苗者和未接种疫苗者被感染后的症状特征等。(完)\", \"entity\": {\"毒株\": 0, \"德尔塔\": 0}}\\n']\n",
"================================样本2, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 44594, \"content\": \"民航局2022年1月21日发布的熔断指令去年底,多班自美国飞往中国的航班推迟或取消,曾引起关注。中国外交部发言人赵立坚在去年12月就达美航空赴华航班中途返航一事回应表示,近日,多班自美国飞往中国的航班推迟或取消,美国航空在距离飞机起飞仅有数小时的情况下突然宣布取消航班,达美航空的航班甚至出现航程过半后返航情况,给中国籍乘客带来巨大损失。中国驻美使领馆积极向有关乘客提供协助,并第一时间向美国有关航司提出严正交涉,敦促其保障乘客正当权益。中国外交部发言人华春莹去年8月表示,众所周知,国际定期客运航班熔断/熔控措施是降低疫情跨境传播风险的重要举措,该措施对中外航空公司一视同仁,公平公开。在中美航线上,中国国内的国航、东航等航空公司都曾熔断过,对于没有触发熔断条件的航空公司,中方从未实施该措施,因此这次美方没有理由限制中国赴美航班客座率,美方做法非常不合理。为何熔断航班激增值得注意的是,早在去年8月,美国交通部就曾要求中国的航空公司在未来四周内,将部分中国赴美航班的客座率限制在40%,当时也是对于美联航被触发“熔断”措施的回应。\", \"entity\": {\"中国\": 0, \"航班\": 0}}\\n']\n",
"================================样本3, train: 66897 dev: 22298 dev_type2: 470\n",
"['{\"id\": 66896, \"content\": \"当地时间11月5日晚,在英国伦敦的“百万面具游行”(Million Mask March)活动过程中,抗议者与警方发生冲突,致8名警察受伤,十余名抗议者被捕。 据英国《卫报》5日报道,当天夜晚,数百名抗议者聚集在英国伦敦,参加一年一度的游行。在游行过程中,参与者抗议政府越权、收入不平等,以及最近新出台的新冠疫情限制措施。 报道称,部分抗议者在游行中与警方发生冲突。伦敦警察厅表示,在伦敦各处的示威活动中,共有12人因各种违法行为被拘捕,此外,已有8名警察在与抗议者的冲突中受伤。 伦敦警察厅还在社交平台发布声明称,“有部分人在议会广场上燃放烟花和爆竹,该行为非常危险。警方为防止民众受到伤害而进入人群。” 据此前报道,“百万面具游行”活动于2011年由一个匿名黑客论坛发起,旨在以游行示威的方式反对审查制度、腐败和战争。\", \"entity\": {\"伦敦\": 0, \"英国\": 0}}\\n']\n"
]
}
],
"source": [
"from sklearn.model_selection import StratifiedKFold\n",
"import json\n",
"with open('E:/Github/Sohu2022/Sohu2022_data/nlp_data/train.txt', 'r', encoding='utf-8') as f:\n",
" train_data = f.readlines()\n",
"tag2_index = []\n",
"for line in train_data:\n",
" line = json.loads(line)\n",
" if 2 in set(line['entity'].values()):\n",
" tag2_index.append(1)\n",
" else:\n",
" tag2_index.append(0)\n",
"print(sum(tag2_index))\n",
" \n",
"print('样本总量:', len(train_data))\n",
"file_id = 0\n",
"kfold = StratifiedKFold(n_splits=4).split(train_data, tag2_index)\n",
"for i, (train_idx, dev_idx) in enumerate(kfold):\n",
" train, dev = [train_data[i] for i in train_idx], [train_data[i] for i in dev_idx]\n",
" dev_type2 = [tag2_index[i] for i in dev_idx]\n",
" with open(f'E:/Github/Sohu2022/Sohu2022_data/nlp_data/dev_{file_id}.txt', 'w', encoding='utf-8') as f:\n",
" f.writelines(dev)\n",
" with open(f'E:/Github/Sohu2022/Sohu2022_data/nlp_data/train_{file_id}.txt', 'w', encoding='utf-8') as f:\n",
" f.writelines(train)\n",
" \n",
" print(f'================================样本{file_id}, train: ', len(train), 'dev: ', len(dev), 'dev_type2: ', sum(dev_type2))\n",
" print(dev[:1])\n",
" file_id += 1"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
#! -*- coding:utf-8 -*-
# Sohu 2022 entity sentiment classification baseline, https://www.biendata.xyz/competition/sohu_2022/
# Approach: average-pool the start/end hidden states of the entity's first occurrence in the sentence; fgm + multi_dropout + cv, f1=0.67176
import numpy as np
import random
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.losses import FocalLoss
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
import random
import os
import argparse
import pickle
import warnings
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser(description='cross validation')
parser.add_argument('--fileid', default=0)
parser.add_argument('--gpuid', default=0)
args = parser.parse_args()
fileid = args.fileid
gpuid = args.gpuid
# Configuration
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'
prefix = f'_char_512_cv_{fileid}'
save_path = f'./output/section1{prefix}.txt'
save_path_dev = f'./output/dev{prefix}.txt'
ckpt_path = f'./ckpt/best_model{prefix}.pt'
device = f'cuda:{gpuid}' if torch.cuda.is_available() else 'cpu'
seed = 42
# Model settings
epochs = 10
steps_per_epoch = 1000
total_eval_step = None
maxlen = 512
batch_size = 7
batch_size_eval = 64
categories = [-2, -1, 0, 1, 2]
categories_count = {k+1:0 for k in range(len(categories))}
# Fix the random seed
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Load the dataset
class MyDataset(ListDataset):
def __init__(self, file_path=None, data=None, mode='train'):
self.mode = mode
super().__init__(file_path, data)
def load_data(self, filename):
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
with open(filename, encoding='utf-8') as f:
for l in tqdm(f, desc=f'[Loading {self.mode} data]'):
taskData = json.loads(l.strip())
id = taskData['id']
# split by maximum length and punctuation
for t in text_segmentate(taskData['content'], maxlen - 2, seps, strips):
entitys = []
# train
if isinstance(taskData['entity'], dict):
for ent, label in taskData['entity'].items():
start = self.search(ent, t)
if start != -1:
label = categories.index(label)+1
entitys.append((ent, start, start+len(ent)-1, label)) # the +1 above reserves 0 for padding
categories_count[label] += 1
# test
elif isinstance(taskData['entity'], list):
for ent in taskData['entity']:
start = self.search(ent, t)
if start != -1:
entitys.append((ent, start, start+len(ent)-1, 0))
if entitys: # only keep segments that contain at least one entity
D.append((id, t, *entitys))
return D
def search(self, pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_extra, batch_token_ids, batch_entity_ids, batch_entity_labels = [], [], [], []
for d in batch:
id, contents, entities = d[0], d[1], d[2:]
tokens = tokenizer.tokenize(contents, maxlen=maxlen)[1:-1]
tokens = ['[CLS]'] + [j for i in tokens for j in i] + ['[SEP]'] # flatten to character-level tokens
mapping = tokenizer.rematch(contents, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
entity_ids, entity_labels, extra_map = [], [], {}
for ent, start, end, label in entities:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
entity_ids.append([start, end])
# # sanity check that the boundary ids are correct
# if ''.join(tokenizer.ids_to_tokens(token_ids[start:end+1])) != ent.lower():
# print(''.join(tokenizer.ids_to_tokens(token_ids[start:end+1])), ent)
entity_labels.append(label)
extra_map[(start, end)] = (ent, label)
if not entity_ids: # there must be at least one label
entity_ids.append([0, 0]) # pad with [0, 0] if there is none
entity_labels.append(0)
batch_extra.append((id, extra_map))
batch_token_ids.append(token_ids)
batch_entity_ids.append(entity_ids)
batch_entity_labels.append(entity_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device) # [btz, num_entities, start/end]
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device) # [btz, num_entities]
return [batch_token_ids, batch_entity_ids, batch_extra], batch_entity_labels
# Build the dataloaders
train_dataloader = DataLoader(MyDataset(f'{data_dir}/train_{fileid}.txt'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'{data_dir}/dev_{fileid}.txt', mode='dev'), batch_size=batch_size_eval, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'{data_dir}/test.txt', mode='test'), batch_size=batch_size_eval, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.dropout = [nn.Dropout(0.1), nn.Dropout(0.3), nn.Dropout(0.5), nn.Dropout(0.7)]
self.dense = nn.Linear(768, 5+1) # includes the padding class
def forward(self, inputs):
token_ids, entity_ids = inputs[0], inputs[1]
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
btz, entity_count, _ = entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
entity_states = torch.mean(entity_states, dim=2) # average the hidden states of the entity start and end tokens
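# Shape walkthrough: entity_ids [btz, num_entities, 2] is repeated to [btz, num_entities*2, hdsz];
# gather picks the start/end hidden states, the reshape gives [btz, num_entities, 2, hdsz],
# and the mean over dim=2 leaves one [btz, num_entities, hdsz] vector per entity.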
entity_logits = []
for dropout in self.dropout:
entity_logits.append(self.dense(dropout(entity_states)))
return entity_logits
model = Model().to(device)
print(categories_count)
class Loss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.loss_fn = FocalLoss(ignore_index=0)
def forward(self, entity_logits, labels):
loss = 0
for entity_logit in entity_logits:
loss += self.loss_fn(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
return loss
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=1e-5), adversarial_train={'name': 'fgm'})
def evaluate(data):
valid_true, valid_pred = [], []
eval_step = 0
result, result_prob = dict(), dict()
for (token_ids, entity_ids, extra), entity_labels in tqdm(data):
entity_logit = model.predict([token_ids, entity_ids])[0] # [btz, 实体个数, 实体类别数]
entity_logit = F.softmax(entity_logit, dim=-1)
entity_prob, entity_pred = torch.max(entity_logit, dim=-1) # [btz, 实体个数]
# v_pred and v_true are the entity-level predictions; entity_tuple is a list of (smp_id, ent_id, start, end, label, prob)
v_pred, entity_tuple = trans_entity2tuple(entity_ids, entity_pred, entity_prob)
v_true, _ = trans_entity2tuple(entity_ids, entity_labels)
valid_pred.extend(v_pred)
valid_true.extend(v_true)
# generate submit result
for id_, ent_id_, start, end, label_, prob in entity_tuple:
label_ = label_-3
smp_id, s_e_ents = extra[id_][0], extra[id_][1]
if (start, end) not in s_e_ents:
raise ValueError('entity missing')
if smp_id not in result:
result[smp_id], result_prob[smp_id] = {}, {}
ent_name = s_e_ents[(start, end)][0]
if ent_name in result[smp_id] and prob < result[smp_id][ent_name][-1]:
# the same entity already has a higher-probability prediction, keep the existing one
continue
else:
result[smp_id].update({ent_name: (label_, prob)})
ent_prob = entity_logit[id_][ent_id_].cpu().numpy()
result_prob[smp_id].update({ent_name: ent_prob})
assert prob == ent_prob[label_+3]
eval_step += 1
if (total_eval_step is not None) and (eval_step >= total_eval_step):
break
valid_true = np.array(valid_true)
valid_pred = np.array(valid_pred)
f1 = f1_score(valid_true, valid_pred, average='macro')
acc = accuracy_score(valid_true, valid_pred)
print(classification_report(valid_true, valid_pred))
# keep only the label, drop the probability
for k, v in result.items():
result[k] = {i: j[0] for i, j in v.items()}
return f1, acc, result, result_prob
def trans_entity2tuple(entity_ids, entity_labels, entity_probs=None):
'''Convert tensors into (sample id, entity id, start, end, entity label[, probability]) tuples for metric computation
'''
y, ent_tuple = [], []
for i, one_sample in enumerate(entity_ids): # 遍历样本
for j, item in enumerate(one_sample): # 遍历实体
if item[0].item() * item[1].item() != 0:
tmp = (i, j, item[0].item(), item[1].item(), entity_labels[i, j].item())
y.append(entity_labels[i, j].item())
ent_tuple.append(tmp if entity_probs is None else tmp + (entity_probs[i, j].item(),))
return y, ent_tuple
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, acc, pred_result, pred_result_prob = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
model.save_weights(ckpt_path)
# save_result(pred_result, pred_result_prob, save_path=save_path_dev)
print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
def save_result(result, result_prob, save_path):
result = [(key, value) for key, value in result.items()]
result.sort(key=lambda x: x[0])
result_str = 'id\tresult\n'
for key, value in result:
result_str += f'{key}\t{value}\n'
with open(save_path, 'w', encoding='utf-8') as f:
f.write(result_str)
# save the probabilities
with open(save_path[:-4] + '_prob.pkl', 'wb') as f:
pickle.dump(result_prob, f)
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])
model.load_weights(ckpt_path)
f1, acc, pred_result, pred_result_prob = evaluate(test_dataloader)
save_result(pred_result, pred_result_prob, save_path=save_path)
#! -*- coding:utf-8 -*-
# Reproduction of the Top1 solution for Sohu 2022 entity sentiment classification, https://www.biendata.xyz/competition/sohu_2022/
# Write-up: https://zhuanlan.zhihu.com/p/533808475
# Reproduction approach: prompt-like concatenation [CLS]+sentence+[SEP]+ent1+[MASK]+ent2+[MASK]+[SEP]; the hidden state at each [MASK] position is used for classification
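# Illustration with a hypothetical sample: content="A公司起诉B公司", entity={"A公司": 1, "B公司": -1}
# is packed as text_pair="A公司<mask>B公司<mask>", and the classifier reads the hidden state at each <mask>
# position (labels are mapped through categories.index below).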
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.tokenizers import Tokenizer, SpTokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import transformers
import random
from sklearn.metrics import f1_score, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")
# Configuration
pretrain_model = 'F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base'
config_path = pretrain_model + '/bert4torch_config.json'
checkpoint_path = pretrain_model + '/pytorch_model.bin'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'
prefix = f'_char_512'
save_path = f'./section1{prefix}.txt'
save_path_dev = f'./dev{prefix}.txt'
ckpt_path = f'./best_model{prefix}.pt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
use_swa = False
use_adv_train = False
# Model settings
epochs = 10
steps_per_epoch = None
total_eval_step = None
num_warmup_steps = 4000
maxlen = 900
batch_size = 6
batch_size_eval = 64
grad_accumulation_steps = 3
categories = [-2, -1, 0, 1, 2]
mask_symbol = '<mask>'
seed_everything(19260817) # fix the random seed
# Load the dataset
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
for l in tqdm(f.readlines(), desc="Loading data"):
taskData = json.loads(l.strip())
text2 = ''.join([ent+mask_symbol for ent in taskData['entity'].keys()])
D.append((taskData['content'], text2, taskData['entity']))
return D
def search(tokens, search_token, start_idx=0):
mask_idxs = []
for i in range(len(tokens)):
if tokens[i] == search_token:
mask_idxs.append(i+start_idx)
return mask_idxs
# Build the tokenizer; the one shipped with transformers is used here
tokenizer = transformers.XLNetTokenizerFast.from_pretrained(pretrain_model)
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_entity_ids, batch_entity_labels = [], [], [], []
for text, prompt, entity in batch:
inputs = tokenizer.__call__(text=text, text_pair=prompt, add_special_tokens=True, max_length=maxlen, truncation="only_first")
token_ids, segment_ids = inputs['input_ids'], inputs['token_type_ids']
ent_ids = search(token_ids, tokenizer.mask_token_id)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_entity_ids.append(ent_ids)
batch_entity_labels.append([categories.index(label) for label in entity.values()])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, value=-1), dtype=torch.long, device=device) # [btz, 实体个数]
return [batch_token_ids, batch_segment_ids, batch_entity_ids], batch_entity_labels
# Build the dataloaders
all_data = load_data(f'{data_dir}/train.txt')
random.shuffle(all_data)
split_index = 2000 # int(len(all_data)*0.9)
train_dataloader = DataLoader(ListDataset(data=all_data[split_index:]), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=all_data[:split_index]), batch_size=batch_size_eval, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='xlnet')
hidden_size = self.bert.configs['hidden_size']
self.classifier = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.LeakyReLU(),
nn.Dropout(0.1),
nn.Linear(hidden_size, 5)
)
def forward(self, inputs):
token_ids, segment_ids, entity_ids = inputs
last_hidden_state = self.bert([token_ids, segment_ids]) # [btz, seq_len, hdsz]
entity_ids = entity_ids.unsqueeze(2).repeat(1, 1, last_hidden_state.shape[-1])
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids)
entity_logits = self.classifier(entity_states)
return entity_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, entity_logit, labels):
loss = super().forward(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
return loss
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps=len(train_dataloader)*epochs, last_epoch=-1)
model.compile(loss=Loss(ignore_index=-1), optimizer=optimizer, scheduler=scheduler, clip_grad_norm=1.0, adversarial_train={'name': 'fgm' if use_adv_train else ''})
# swa
if use_swa:
def average_function(ax: torch.Tensor, x: torch.Tensor, num: int) -> torch.Tensor:
return ax + (x - ax) / (num + 1)
swa_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=average_function)
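    # average_function keeps a running mean: ax_{n+1} = ax_n + (x - ax_n) / (num + 1), so after N
    # updates swa_model holds the plain average of the N collected weight snapshots.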
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, acc, pred_result = self.evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
model.save_weights(ckpt_path)
print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
if use_swa:
swa_model.update_parameters(model)
@staticmethod
def evaluate(data):
valid_true, valid_pred = [], []
eval_step = 0
result = dict()
        for (token_ids, segment_ids, entity_ids), entity_labels in tqdm(data):
            if use_swa:
                swa_model.eval()
                with torch.no_grad():
                    entity_logit = F.softmax(swa_model([token_ids, segment_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
            else:
                entity_logit = F.softmax(model.predict([token_ids, segment_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
_, entity_pred = torch.max(entity_logit, dim=-1) # [btz, 实体个数]
# v_pred和v_true是实体的预测结果
valid_index = (entity_ids.flatten()>0).nonzero().squeeze(-1)
valid_pred.extend(entity_pred.flatten()[valid_index].cpu().tolist())
valid_true.extend(entity_labels.flatten()[valid_index].cpu().tolist())
eval_step += 1
if (total_eval_step is not None) and (eval_step >= total_eval_step):
break
valid_true = np.array(valid_true)
valid_pred = np.array(valid_pred)
f1 = f1_score(valid_true, valid_pred, average='macro')
acc = accuracy_score(valid_true, valid_pred)
print(classification_report(valid_true, valid_pred))
# 只保留label,不需要prob
for k, v in result.items():
result[k] = {i: j[0] for i, j in v.items()}
return f1, acc, result
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, grad_accumulation_steps=grad_accumulation_steps, callbacks=[evaluator])
model.load_weights(ckpt_path)
f1, acc, pred_result = Evaluator.evaluate(valid_dataloader)
#! -*- coding:utf-8 -*-
# Reproduction of the Sohu 2022 entity sentiment classification Top1 solution, https://www.biendata.xyz/competition/sohu_2022/
# Write-up: https://zhuanlan.zhihu.com/p/533808475
# Approach: prompt-style concatenation [CLS]+sentence+[SEP]+ent1+[MASK]+ent2+[MASK]+[SEP]; classification is done at the [MASK] positions
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")
# 配置设置
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'
prefix = f'_char_512'
save_path = f'./section1{prefix}.txt'
save_path_dev = f'./dev{prefix}.txt'
ckpt_path = f'./best_model{prefix}.pt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
use_swa = True
# 模型设置
epochs = 10
steps_per_epoch = None
total_eval_step = None
num_warmup_steps = 4000
maxlen = 512
batch_size = 7
batch_size_eval = 64
categories = [-2, -1, 0, 1, 2]
seed_everything(42)  # fix the random seed for reproducibility
# 加载数据集
def load_data(filename):
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
with open(filename, encoding='utf-8') as f:
for l in tqdm(f.readlines(), desc="Loading data"):
taskData = json.loads(l.strip())
text2 = ''.join([ent+'[MASK]' for ent in taskData['entity'].keys()]) + '[SEP]'
text2_len = sum([len(ent)+1 for ent in taskData['entity'].keys()]) + 1
for t in text_segmentate(taskData['content'], maxlen-text2_len-2, seps, strips):
D.append((t, text2, taskData['entity']))
return D
def search(tokens, start_idx=0):
mask_idxs = []
for i in range(len(tokens)):
if tokens[i] == '[MASK]':
mask_idxs.append(i+start_idx)
return mask_idxs
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_entity_ids, batch_entity_labels = [], [], []
for text1, text2, entity in batch:
token_ids1 = tokenizer.encode(text1)[0]
tokens2 = tokenizer.tokenize(text2)[1:-1]
token_ids2 = tokenizer.tokens_to_ids(tokens2)
ent_ids_raw = search(tokens2, start_idx=len(token_ids1))
# 不在原文中的实体,其[MASK]标记不用于计算loss
ent_labels, ent_ids = [], []
for i, (ent, label) in enumerate(entity.items()):
if ent in text1:
assert tokens2[ent_ids_raw[i]-len(token_ids1)] == '[MASK]'
ent_ids.append(ent_ids_raw[i])
ent_labels.append(categories.index(label))
batch_token_ids.append(token_ids1 + token_ids2)
batch_entity_ids.append(ent_ids)
batch_entity_labels.append(ent_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)
batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, value=-1), dtype=torch.long, device=device) # [btz, 实体个数]
return [batch_token_ids, batch_entity_ids], batch_entity_labels
# 转换数据集
all_data = load_data(f'{data_dir}/train.txt')
split_index = int(len(all_data)*0.9)
train_dataloader = DataLoader(ListDataset(data=all_data[:split_index]), batch_size=batch_size, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=all_data[split_index:]), batch_size=batch_size_eval, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
hidden_size = self.bert.configs['hidden_size']
self.classifier = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.LeakyReLU(),
nn.Dropout(0.1),
nn.Linear(hidden_size, 5)
)
def forward(self, inputs):
token_ids, entity_ids = inputs[0], inputs[1]
last_hidden_state = self.bert([token_ids]) # [btz, seq_len, hdsz]
hidden_size = last_hidden_state.shape[-1]
entity_ids = entity_ids.unsqueeze(2).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids)
entity_logits = self.classifier(entity_states)
return entity_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, entity_logit, labels):
loss = super().forward(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
return loss
optimizer = optim.Adam(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps=len(train_dataloader)*epochs, last_epoch=-1)
model.compile(loss=Loss(ignore_index=-1), optimizer=optimizer, scheduler=scheduler, adversarial_train={'name': 'fgm'})
# swa
if use_swa:
def average_function(ax: torch.Tensor, x: torch.Tensor, num: int) -> torch.Tensor:
return ax + (x - ax) / (num + 1)
swa_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=average_function)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, acc, pred_result = self.evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
model.save_weights(ckpt_path)
print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
if use_swa:
swa_model.update_parameters(model)
@staticmethod
def evaluate(data):
valid_true, valid_pred = [], []
eval_step = 0
result = dict()
for (token_ids, entity_ids), entity_labels in tqdm(data):
if use_swa:
swa_model.eval()
with torch.no_grad():
entity_logit = F.softmax(swa_model([token_ids, entity_ids]), dim=-1) # [btz, 实体个数, 实体类别数]
else:
entity_logit = F.softmax(model.predict([token_ids, entity_ids]), dim=-1) # [btz, 实体个数, 实体类别数]
_, entity_pred = torch.max(entity_logit, dim=-1) # [btz, 实体个数]
# v_pred和v_true是实体的预测结果
valid_index = (entity_ids.flatten()>0).nonzero().squeeze(-1)
valid_pred.extend(entity_pred.flatten()[valid_index].cpu().tolist())
valid_true.extend(entity_labels.flatten()[valid_index].cpu().tolist())
eval_step += 1
if (total_eval_step is not None) and (eval_step >= total_eval_step):
break
valid_true = np.array(valid_true)
valid_pred = np.array(valid_pred)
f1 = f1_score(valid_true, valid_pred, average='macro')
acc = accuracy_score(valid_true, valid_pred)
print(classification_report(valid_true, valid_pred))
# 只保留label,不需要prob
for k, v in result.items():
result[k] = {i: j[0] for i, j in v.items()}
return f1, acc, result
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])
model.load_weights(ckpt_path)
f1, acc, pred_result = Evaluator.evaluate(valid_dataloader)
# 天池新闻分类
Competition link: https://tianchi.aliyun.com/competition/entrance/531810/introduction?lang=zh-cn
| Solution | Description | Metric |
| ---- | ---- | ---- |
| Top1 | [Github](https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION) | official round f1=0.9735 |
| Top1 reproduction | bert initialized from the fold-1 CV checkpoint at epoch=5, trained for 1 epoch; ensemble of seed=0, 1993 and 2020 (see the sketch below) | long-term round f1=0.9736 |
| Top1_bert4torch reproduction | bert+attn+fgm+cv | long-term round f1=0.9727, dev_5cv=(0.97083, 0.97074, 0.96914, 0.96892, 0.96613) |
## File description
- convert.py: converts the tensorflow weights from the link above to pytorch
- training.py: finetune training code
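The "Top1 reproduction" row fuses three runs trained with different seeds. Below is a minimal sketch of that fusion step, assuming each run saved its test logits to a file named `test_seed{seed}_logit.npy` (hypothetical filenames, not produced by the scripts here):

```python
import numpy as np
import pandas as pd

seeds = [0, 1993, 2020]
# each array is assumed to have shape [num_samples, num_classes]
logits = np.stack([np.load(f'test_seed{s}_logit.npy') for s in seeds])
labels = logits.mean(axis=0).argmax(axis=1)  # average the runs, then take the best class
pd.DataFrame({'label': labels}).to_csv('submission_seed_ensemble.csv', index=False)
```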
import torch
import tensorflow as tf
tf_path = 'E:/Github/天池新闻分类/top1/pre_models/bert_model.ckpt'
torch_state_dict = {}
mapping = {
'bert/embeddings/word_embeddings': 'bert.embeddings.word_embeddings.weight',
'bert/embeddings/token_type_embeddings': 'bert.embeddings.token_type_embeddings.weight',
'bert/embeddings/position_embeddings': 'bert.embeddings.position_embeddings.weight',
'bert/embeddings/LayerNorm/beta': 'bert.embeddings.LayerNorm.bias',
'bert/embeddings/LayerNorm/gamma': 'bert.embeddings.LayerNorm.weight',
# 'bert/pooler/dense/kernel': 'bert.pooler.dense.weight',
# 'bert/pooler/dense/bias': 'bert.pooler.dense.bias',
# 'cls/seq_relationship/output_weights': 'cls.seq_relationship.weight',
# 'cls/seq_relationship/output_bias': 'cls.seq_relationship.bias',
'cls/predictions/transform/dense/kernel': 'cls.predictions.transform.dense.weight##T',
'cls/predictions/transform/dense/bias': 'cls.predictions.transform.dense.bias',
'cls/predictions/transform/LayerNorm/beta': 'cls.predictions.transform.LayerNorm.bias',
'cls/predictions/transform/LayerNorm/gamma': 'cls.predictions.transform.LayerNorm.weight',
'cls/predictions/output_bias': 'cls.predictions.bias',
}
for i in range(12):
prefix = 'bert/encoder/layer_%d/' % i
prefix_i = f'bert.encoder.layer.%d.' % i
mapping.update({
prefix + 'attention/self/query/kernel': prefix_i + 'attention.self.query.weight##T',
prefix + 'attention/self/query/bias': prefix_i + 'attention.self.query.bias',
prefix + 'attention/self/key/kernel': prefix_i + 'attention.self.key.weight##T',
prefix + 'attention/self/key/bias': prefix_i + 'attention.self.key.bias',
prefix + 'attention/self/value/kernel': prefix_i + 'attention.self.value.weight##T',
prefix + 'attention/self/value/bias': prefix_i + 'attention.self.value.bias',
prefix + 'attention/output/dense/kernel': prefix_i + 'attention.output.dense.weight##T',
prefix + 'attention/output/dense/bias': prefix_i + 'attention.output.dense.bias',
prefix + 'attention/output/LayerNorm/beta': prefix_i + 'attention.output.LayerNorm.bias',
prefix + 'attention/output/LayerNorm/gamma': prefix_i + 'attention.output.LayerNorm.weight',
prefix + 'intermediate/dense/kernel': prefix_i + 'intermediate.dense.weight##T',
prefix + 'intermediate/dense/bias': prefix_i + 'intermediate.dense.bias',
prefix + 'output/dense/kernel': prefix_i + 'output.dense.weight##T',
prefix + 'output/dense/bias': prefix_i + 'output.dense.bias',
prefix + 'output/LayerNorm/beta': prefix_i + 'output.LayerNorm.bias',
prefix + 'output/LayerNorm/gamma': prefix_i + 'output.LayerNorm.weight',
})
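# Keys ending in '##T' mark TF dense kernels stored as [in_features, out_features]; the loop below
# strips the marker and transposes them to the [out_features, in_features] layout used by torch nn.Linear weights.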
for old_key, new_key in mapping.items():
    try:
        ts = tf.train.load_variable(tf_path, old_key)
        if new_key.endswith('##T'):
            torch_state_dict[new_key[:-len('##T')]] = torch.from_numpy(ts).T  # drop the '##T' marker and transpose
        else:
            torch_state_dict[new_key] = torch.from_numpy(ts)
    except Exception:
        print('Missing ', old_key)
torch.save(torch_state_dict, 'E:/Github/天池新闻分类/top1/pre_models/pytorch_model.bin')
# Model inference script
# Each CV fold predicts separately; the logits are then weighted by the folds' dev metrics
from training import Model, collate_fn
import torch
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset
import pandas as pd
from tqdm import tqdm
import numpy as np
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 16
def load_data(df):
"""加载数据。"""
D = list()
for _, row in df.iterrows():
text = row['text']
D.append((text, 0))
return D
df_test = pd.read_csv('E:/Github/天池新闻分类/data/test_a.csv', sep='\t')
df_test['text'] = df_test['text'].apply(lambda x: x.strip().split())
test_data = load_data(df_test)
dev_dataloader = DataLoader(ListDataset(data=test_data), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)  # keep the test order so predictions line up with the rows of df_test
f1_score = [0.97083, 0.97074, 0.96914, 0.96892, 0.96613]
y_pred_final = 0
for i in range(5):
model = Model().to(device)
model.load_weights(f'best_model_fold{i+1}.pt')
y_pred = []
for x, _ in tqdm(dev_dataloader, desc=f'evaluate_cv{i}'):
y_pred.append(model.predict(x).cpu().numpy())
# if len(y_pred) > 10:
# break
y_pred = np.concatenate(y_pred)
y_pred_final += y_pred * f1_score[i]
np.save(f'test_cv{i}_logit.npy', y_pred)
df_test = pd.DataFrame(y_pred_final.argmax(axis=1))
df_test.columns = ['label']
df_test.to_csv('submission.csv', index=False)
# Model training script
# Link: https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION
# Only the finetune part of the Top1 solution is implemented here with bert4torch; the original author's pretrained weights are used directly after conversion to pytorch
import numpy as np
import pandas as pd
from bert4torch.models import build_transformer_model, BaseModel
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, ListDataset, Callback, EarlyStopping
from bert4torch.tokenizers import Tokenizer
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import torch
from torch import nn, optim
from tqdm import tqdm
# BERT base
config_path = 'E:/Github/天池新闻分类/top1/pre_models/bert_config.json'
checkpoint_path = 'E:/Github/天池新闻分类/top1/pre_models/pytorch_model.bin'
dict_path = 'E:/Github/天池新闻分类/top1/pre_models/vocab.txt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
n = 5 # Cross-validation
SEED = 2020
num_classes = 14
maxlen = 512
max_segment = 2
batch_size = 4
grad_accum_steps = 64
drop = 0.2
lr = 2e-5
epochs = 100
def load_data(df):
"""加载数据。"""
D = list()
for _, row in df.iterrows():
text = row['text']
label = row['label']
D.append((text, int(label)))
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def sentence_split(words):
"""句子截断。"""
document_len = len(words)
index = list(range(0, document_len, maxlen-2))
index.append(document_len)
segments = []
for i in range(len(index) - 1):
segment = words[index[i]: index[i + 1]]
assert len(segment) > 0
segment = tokenizer.tokens_to_ids(['[CLS]'] + segment + ['[SEP]'])
segments.append(segment)
assert len(segments) > 0
if len(segments) > max_segment:
segment_ = int(max_segment / 2)
return segments[:segment_] + segments[-segment_:]
else:
return segments
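# Illustrative example (assumed lengths): with maxlen=512 and max_segment=2, a 1200-token document is
# cut into word chunks of 510/510/180 (each wrapped with [CLS]/[SEP]); since 3 > max_segment, only the
# first and the last chunk are kept.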
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = sentence_split(text)
token_ids = sequence_padding(token_ids, length=maxlen)
batch_token_ids.append(token_ids)
batch_labels.append(label)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=max_segment), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, device=device)
return batch_token_ids, batch_labels
class Attention(nn.Module):
"""注意力层。"""
def __init__(self, hidden_size, **kwargs):
self.hidden_size = hidden_size
super().__init__(**kwargs)
self.weight = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.bias = nn.Parameter(torch.zeros(self.hidden_size))
self.query = nn.Linear(self.hidden_size, 1, bias=False)
def forward(self, x, mask):
'''x: [btz, max_segment, hdsz]
mask: [btz, max_segment, 1]
'''
mask = mask.squeeze(2) # [btz, max_segment]
# linear
key = self.weight(x) + self.bias # [btz, max_segment, hdsz]
# compute attention
outputs = self.query(key).squeeze(2) # [btz, max_segment]
outputs -= 1e32 * (1 - mask)
attn_scores = F.softmax(outputs, dim=-1)
attn_scores = attn_scores * mask
attn_scores = attn_scores.reshape(-1, 1, attn_scores.shape[-1]) # [btz, 1, max_segment]
outputs = torch.matmul(attn_scores, key).squeeze(1) # [btz, hdsz]
return outputs
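# The pooling above computes key_i = W*x_i + b, attention scores softmax(q·key_i) over the non-padded
# segments, and returns sum_i score_i * key_i as a single document-level vector.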
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.dropout1 = nn.Dropout(0.1)
self.dropout2 = nn.Dropout(0.1)
self.attn = Attention(768)
self.dense = nn.Linear(768, num_classes)
def forward(self, token_ids):
''' token_ids: [btz, max_segment, max_len]
'''
input_mask = torch.any(token_ids, dim=-1, keepdim=True).long() # [btz, max_segment, 1]
token_ids = token_ids.reshape(-1, token_ids.shape[-1]) # [btz*max_segment, max_len]
output = self.bert([token_ids])[:, 0] # [btz*max_segment, hdsz]
output = output.reshape((-1, max_segment, output.shape[-1])) # [btz, max_segment, hdsz]
output = output * input_mask
output = self.dropout1(output)
output = self.attn(output, input_mask)
output = self.dropout2(output)
output = self.dense(output)
return output
class Evaluator(Callback):
def __init__(self, model, dataloader, fold):
super().__init__()
self.model = model
self.dataloader = dataloader
self.best_val_f1 = 0.
self.fold = fold
def evaluate(self):
y_true, y_pred = list(), list()
for x, y in tqdm(self.dataloader, desc='evaluate'):
y_true.append(y.cpu().numpy())
y_pred.append(self.model.predict(x).argmax(axis=1).cpu().numpy())
y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
return f1
def on_epoch_end(self, steps, epoch, logs=None):
val_f1 = self.evaluate()
if val_f1 > self.best_val_f1:
self.best_val_f1 = val_f1
self.model.save_weights(f'best_model_fold{self.fold}.pt')
logs['val_f1'] = val_f1 # 这个要设置,否则EarlyStopping不生效
print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}\n')
def do_train(df_train):
skf = StratifiedKFold(n_splits=n, random_state=SEED, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train['text'], df_train['label']), 1):
print(f'[Fold {fold}]')
train_data = load_data(df_train.iloc[trn_idx])
valid_data = load_data(df_train.iloc[val_idx])
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)
model = Model().to(device)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=lr), adversarial_train={'name': 'fgm'})
callbacks = [
Evaluator(model, valid_dataloader, fold),
EarlyStopping(monitor='val_f1', patience=5, verbose=1, mode='max'), # 需要在Evaluator后面
]
model.fit(
train_dataloader,
steps_per_epoch=None,
epochs=epochs,
grad_accumulation_steps=grad_accum_steps,
callbacks=callbacks
)
del model
if __name__ == '__main__':
df_train = pd.read_csv('E:/Github/天池新闻分类/data/train_set.csv', sep='\t')
df_train['text'] = df_train['text'].apply(lambda x: x.strip().split())
do_train(df_train)
#! -*- coding:utf-8 -*-
# Sentence-pair classification task on the LCQMC dataset
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tensorboardX import SummaryWriter
maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir='./summary') # prepare summary writer
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text1, text2, label = l.strip().split('\t')
D.append((text1, text2, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text1, text2, label in batch:
token_ids, segment_ids = tokenizer.encode(text1, text2, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return (batch_token_ids, batch_segment_ids), batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.test.data'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# 定义评价函数
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_batch_end(self, global_step, local_step, logs=None):
if global_step % 10 == 0:
writer.add_scalar(f"train/loss", logs['loss'], global_step)
val_acc = evaluate(valid_dataloader)
writer.add_scalar(f"valid/acc", val_acc, global_step)
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification task, loading bert weights
# valid_acc: 94.72, test_acc: 94.11
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir='./summary') # prepare summary writer
choice = 'train' # train表示训练,infer表示推理
# 固定seed
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
hidden_states, pooling = self.bert([token_ids, segment_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
# def on_batch_end(self, global_step, local_step, logs=None):
# if global_step % 10 == 0:
# writer.add_scalar(f"train/loss", logs['loss'], global_step)
# val_acc = evaluate(valid_dataloader)
# writer.add_scalar(f"valid/acc", val_acc, global_step)
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
def inference(texts):
'''单条样本推理
'''
for text in texts:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]
logit = model.predict([token_ids, segment_ids])
y_pred = torch.argmax(torch.softmax(logit, dim=-1)).cpu().numpy()
print(text, ' ----> ', y_pred)
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
inference(['我今天特别开心', '我今天特别生气'])
#! -*- coding:utf-8 -*-
# Sentiment classification task, loading GAU-alpha weights
# Blog: https://kexue.fm/archives/9052
# Weight conversion script: ./convert_script/convert_GAU_alpha.py
# valid_acc: 95.25, test_acc: 94.46
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import random
import os
import numpy as np
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='gau_alpha')
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
last_hidden_state = self.bert([token_ids, segment_ids])
output = self.dropout(last_hidden_state[:, 0, :])
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# 定义评价函数
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
# def on_batch_end(self, global_step, local_step, logs=None):
# if global_step % 10 == 0:
# writer.add_scalar(f"train/loss", logs['loss'], global_step)
# val_acc = evaluate(valid_dataloader)
# writer.add_scalar(f"valid/acc", val_acc, global_step)
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment analysis example using MLM for Zero-Shot / Few-Shot / Semi-Supervised learning
# Reference project: https://github.com/bojone/Pattern-Exploiting-Training
# Metrics below are valid/test accuracy; the random seed is not fixed, so they may fluctuate slightly
# zero-shot1: 0.8517/0.8437
# zero-shot2: 0.8811/0.8707
# few-shot: 0.8896/0.8910
# semi-sup: 0.9024/0.8948
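# Illustrative zero-shot example (hypothetical sentence): with the pattern prefix '很满意。' prepended,
# '这家酒店房间很干净' becomes '很满意。这家酒店房间很干净'; position mask_idx=1 (the character '很') is
# replaced by [MASK], and the MLM probabilities of '很' vs '不' at that position decide positive vs negative.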
import torch
import torch.nn as nn
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model
from torch.optim import Adam
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from torch.utils.data import DataLoader
num_classes = 2
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'semi-sup' # zero-shot1, zero-shot2, few-shot, semi-sup
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
D.append((text, int(label)))
return D
# 加载数据集
train_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data')
valid_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data')
test_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data')
# 模拟标注和非标注数据
train_frac = 0.01 # 标注数据的比例
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
if choice == 'zero-shot2':
train_data = unlabeled_data # 仅使用无监督数据继续mlm预训练
elif choice == 'few-shot':
train_data = train_data[:num_labeled] # 仅使用少量监督数据
elif choice == 'semi-sup': # 少量监督数据和全量无监督数据做半监督
train_data = train_data[:num_labeled]
train_data = train_data + unlabeled_data
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 对应的任务描述
prefix = u'很满意。'
mask_idx = 1
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')
def random_masking(token_ids):
"""对输入进行随机mask
"""
rands = np.random.random(len(token_ids))
source, target = [], []
for r, t in zip(rands, token_ids):
if r < 0.15 * 0.8:
source.append(tokenizer._token_mask_id)
target.append(t)
elif r < 0.15 * 0.9:
source.append(t)
target.append(t)
elif r < 0.15:
source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
target.append(t)
else:
source.append(t)
target.append(0)
return source, target
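# random_masking implements the standard 15% MLM recipe: among the selected positions, 80% become [MASK]
# (r < 0.12), 10% keep the original token (0.12 <= r < 0.135) and 10% get a random token (0.135 <= r < 0.15);
# target=0 marks unselected positions and is ignored by the loss (ignore_index=0).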
class MyDataset(ListDataset):
def collate_fn(self, batch):
batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
for text, label in batch:
if label != 2:
text = prefix + text
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
if self.kwargs['random']:
source_ids, target_ids = random_masking(token_ids)
else:
source_ids, target_ids = token_ids[:], token_ids[:]
if label == 0:
source_ids[mask_idx] = tokenizer._token_mask_id
target_ids[mask_idx] = neg_id
elif label == 1:
source_ids[mask_idx] = tokenizer._token_mask_id
target_ids[mask_idx] = pos_id
batch_token_ids.append(source_ids)
batch_segment_ids.append(segment_ids)
batch_output_ids.append(target_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_output_ids = torch.tensor(sequence_padding(batch_output_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_output_ids
# 加载数据集
train_dataset = MyDataset(data=train_data, random=True)
valid_dataset = MyDataset(data=valid_data, random=False)
test_dataset = MyDataset(data=test_data, random=False)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=valid_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)
# 加载预训练模型
model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True).to(device)
class MyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_preds, y_true):
y_pred = y_preds[1]
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
loss = super().forward(y_pred, y_true.flatten())
return loss
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=MyLoss(ignore_index=0),
optimizer=Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'[{choice}] valid_acc: {val_acc:.4f}, test_acc: {test_acc:.4f}, best_val_acc: {self.best_val_acc:.4f}\n')
@staticmethod
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = F.softmax(model.predict(x_true)[1], dim=-1)
y_pred = y_pred[:, mask_idx, [neg_id, pos_id]].argmax(axis=1)
y_true = (y_true[:, mask_idx] == pos_id).long()
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
if choice == 'zero-shot1':
valid_acc = evaluator.evaluate(valid_dataloader)
test_acc = evaluator.evaluate(test_dataloader)
print(f'[{choice}] valid_acc: {valid_acc:.4f}, test_acc: {test_acc:.4f}')
else:
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment analysis example using MLM + P-tuning; in this example all weights are finetuned (nothing is frozen)
# Official project: https://github.com/THUDM/P-tuning
# Reference project: https://github.com/bojone/P-tuning
# few-shot: 0.8953/0.8953
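# Illustrative template (as built below): 8 trainable pseudo tokens [unused1]..[unused8] plus a [MASK]
# inserted at position mask_idx=5 are placed right after [CLS], i.e.
# [CLS] [unused1] [unused2] [unused3] [unused4] [MASK] [unused5] [unused6] [unused7] [unused8] + sentence;
# the MLM prediction of '很' vs '不' at the [MASK] position gives the sentiment label.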
import torch
import torch.nn as nn
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from torch.optim import Adam
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from torch.utils.data import DataLoader
from torchinfo import summary
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'finetune_all' # finetune_all finetune_few
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
D.append((text, int(label)))
return D
# 加载数据集
train_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data')
valid_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data')
test_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data')
# 模拟标注和非标注数据
train_frac = 0.01 # 标注数据的比例
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 对应的任务描述
mask_idx = 5
desc = ['[unused%s]' % i for i in range(1, 9)]
desc.insert(mask_idx - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')
def random_masking(token_ids):
"""对输入进行随机mask
"""
rands = np.random.random(len(token_ids))
source, target = [], []
for r, t in zip(rands, token_ids):
if r < 0.15 * 0.8:
source.append(tokenizer._token_mask_id)
target.append(t)
elif r < 0.15 * 0.9:
source.append(t)
target.append(t)
elif r < 0.15:
source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
target.append(t)
else:
source.append(t)
target.append(0)
return source, target
class MyDataset(ListDataset):
def collate_fn(self, batch):
batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
if label != 2:
token_ids = token_ids[:1] + desc_ids + token_ids[1:]
segment_ids = [0] * len(desc_ids) + segment_ids
if self.kwargs['random']:
source_ids, target_ids = random_masking(token_ids)
else:
source_ids, target_ids = token_ids[:], token_ids[:]
if label == 0:
source_ids[mask_idx] = tokenizer._token_mask_id
target_ids[mask_idx] = neg_id
elif label == 1:
source_ids[mask_idx] = tokenizer._token_mask_id
target_ids[mask_idx] = pos_id
batch_token_ids.append(source_ids)
batch_segment_ids.append(segment_ids)
batch_output_ids.append(target_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_output_ids = torch.tensor(sequence_padding(batch_output_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_output_ids
# 加载数据集
train_dataset = MyDataset(data=train_data, random=True)
valid_dataset = MyDataset(data=valid_data, random=False)
test_dataset = MyDataset(data=test_data, random=False)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=valid_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)
class MyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_preds, y_true):
y_pred = y_preds[1]
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
loss = super().forward(y_pred, y_true.flatten())
return loss
if choice == 'finetune_few':
# 只训练这几个tokens权重这部分尚未调试好
class PtuningBERT(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True, tie_emb_prj_weight=True, custom_attention_mask=True)
for name, param in self.bert.named_parameters():
if ('word_embeddings' not in name) and ('mlmDecoder' not in name):
param.requires_grad = False # 冻结除了word_embedding层意外的其他层
def forward(self, token_ids, segment_ids):
embedding = self.bert.embeddings.word_embeddings(token_ids)
embedding_no_grad = embedding.detach()
mask = torch.ones(token_ids.shape[1], dtype=torch.long, device=token_ids.device)
mask[1:9] -= 1 # 只优化id为1~8的token
embedding[:, mask.bool()] = embedding_no_grad[:, mask.bool()]
attention_mask = (token_ids != tokenizer._token_pad_id)
return self.bert([embedding, segment_ids, attention_mask])
model = PtuningBERT().to(device)
summary(model, input_data=next(iter(train_dataloader))[0])
elif choice == 'finetune_all':
# 全部权重一起训练
model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=MyLoss(ignore_index=0),
optimizer=Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=6e-4),
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'valid_acc: {val_acc:.4f}, test_acc: {test_acc:.4f}, best_val_acc: {self.best_val_acc:.4f}\n')
@staticmethod
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true)[1]
y_pred = y_pred[:, mask_idx, [neg_id, pos_id]].argmax(axis=1)
y_true = (y_true[:, mask_idx] == pos_id).long()
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification example, loading albert_zh weights (https://github.com/brightmart/albert_zh)
# valid_acc: 94.46, test_acc: 93.98
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tensorboardX import SummaryWriter
import random
import os
import numpy as np
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/albert/[brightmart_tf_small]--albert_small_zh_google/albert_config_small_google.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/albert/[brightmart_tf_small]--albert_small_zh_google/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/albert/[brightmart_tf_small]--albert_small_zh_google/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir='./summary') # prepare summary writer
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
D.append((text, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path, checkpoint_path, model='albert', with_pool=True) # 建立模型,加载权重
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
hidden_states, pooling = self.bert([token_ids, segment_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# 定义评价函数
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
# def on_batch_end(self, global_step, local_step, logs=None):
# if global_step % 10 == 0:
# writer.add_scalar(f"train/loss", logs['loss'], global_step)
# val_acc = evaluate(valid_dataloader)
# writer.add_scalar(f"valid/acc", val_acc, global_step)
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification example, loading electra weights
# valid_acc: 94.94, test_acc: 94.78
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import os
import numpy as np
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/electra/[hit_torch_base]--chinese-electra-base-discriminator/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/electra/[hit_torch_base]--chinese-electra-base-discriminator/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/electra/[hit_torch_base]--chinese-electra-base-discriminator/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
# 指定好model和对应的ckpt地址
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='electra')
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(768, 2)
def forward(self, token_ids, segment_ids):
hidden_states = self.bert([token_ids, segment_ids])
output = self.dropout(hidden_states[:, 0, :]) # 用hidden_states的首位,即[CLS]后接dense层
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# 定义评价函数
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')