Commit 92c75df1 authored by sunzhq2

yidong infer init
#! -*- coding:utf-8 -*-
# loss: concatenate the sentence vectors (u, v, u-v, u*v) and feed the result to CrossEntropyLoss
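# A minimal shape sketch of the feature construction below (assuming BERT-base with hidden_size=768 and the
# default flags concatenation_sent_rep=True, concatenation_sent_difference=True, concatenation_sent_multiplication=False):
#   rep_a, rep_b           -> [batch, 768] pooled sentence vectors u and v
#   cat(u, v, |u - v|)     -> [batch, 2304]  (adding u*v would give [batch, 3072])
#   nn.Linear(2304, 2)     -> logits for CrossEntropyLoss over the similar/dissimilar label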
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys
# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # command-line arguments
pooling, task_name = 'cls', 'ATEC'  # for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], int(l[2])))
return D
def collate_fn(batch):
batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
for text1, text2, label in batch:
label = int(label > 2.5) if task_name == 'STS-B' else label
token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
batch_token1_ids.append(token1_ids)
token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
batch_token2_ids.append(token2_ids)
batch_labels.append([label])
batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls', concatenation_sent_rep=True, concatenation_sent_difference=True, concatenation_sent_multiplication=False):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
self.concatenation_sent_rep = concatenation_sent_rep
self.concatenation_sent_difference = concatenation_sent_difference
self.concatenation_sent_multiplication = concatenation_sent_multiplication
hidden_unit = 0
hidden_unit += 768*2 if self.concatenation_sent_rep else 0
hidden_unit += 768 if self.concatenation_sent_difference else 0
hidden_unit += 768 if self.concatenation_sent_multiplication else 0
self.fc = nn.Linear(hidden_unit, 2)
def forward(self, token1_ids, token2_ids):
hidden_state1, pooler1 = self.bert([token1_ids])
rep_a = get_pool_emb(hidden_state1, pooler1, token1_ids.gt(0).long(), self.pool_method)
hidden_state2, pooler2 = self.bert([token2_ids])
rep_b = get_pool_emb(hidden_state2, pooler2, token2_ids.gt(0).long(), self.pool_method)
vectors_concat = []
if self.concatenation_sent_rep:
vectors_concat.append(rep_a)
vectors_concat.append(rep_b)
if self.concatenation_sent_difference:
vectors_concat.append(torch.abs(rep_a - rep_b))
if self.concatenation_sent_multiplication:
vectors_concat.append(rep_a * rep_b)
vectors_concat = torch.cat(vectors_concat, dim=1)
return self.fc(vectors_concat)
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.bert([token_ids])
attention_mask = token_ids.gt(0).long()
output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
return output
model = Model().to(device)
# Define the loss and optimizer (custom ones are supported)
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_cosine = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_cosine = self.evaluate(valid_dataloader)
test_cosine = self.evaluate(test_dataloader)
if val_cosine > self.best_val_cosine:
self.best_val_cosine = val_cosine
# model.save_weights('best_model.pt')
print(f'valid_cosine: {val_cosine:.5f}, test_cosine: {test_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
# Evaluation function
def evaluate(self, data):
embeddings1, embeddings2, labels = [], [], []
for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
embeddings1.append(model.predict(batch_token1_ids).cpu())
embeddings2.append(model.predict(batch_token2_ids).cpu())
labels.append(batch_labels)
embeddings1 = torch.cat(embeddings1).numpy()
embeddings2 = torch.cat(embeddings2).numpy()
labels = torch.cat(labels).cpu().numpy()
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
return eval_spearman_cosine
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader,
epochs=5,
steps_per_epoch=None,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Semantic similarity task - unsupervised
# ContrastiveTensionLoss: the same sentence is fed into two models; the dot product of the pooled vectors should be large
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | CT | 30.65 | 44.50| 68.67 | 16.20 | 69.27 |
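# Hedged summary of the training signal built below: collate_fn keeps one sentence as an identical
# (sentence, sentence) pair with label 1 and pairs it against the remaining sentences in the batch with
# label 0 (randomly swapping which side holds the positive); the two encoders' pooled vectors are combined
# by a dot product in Model.forward and trained with BCEWithLogitsLoss.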
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import pearsonr, spearmanr
import copy
import random
from tqdm import tqdm
import numpy as np
import sys
import jieba
jieba.initialize()
# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)
# When using NEZHA or RoFormer, the model argument of build_transformer_model needs to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
if task_name == 'PAWSX':
maxlen = 128
else:
maxlen = 64
# BERT configuration
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
def load_data(filenames):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], float(l[2])))
return D
all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
np.random.shuffle(train_texts)
train_texts = train_texts[:10000]
# Training dataset
def collate_fn(batch):
texts_list = [[] for _ in range(2)]
labels = []
pos_id = random.randint(0, len(batch)-1)
pos_token_ids, _ = tokenizer.encode(batch[pos_id], maxlen=maxlen)
texts_list[0].append(pos_token_ids)
texts_list[1].append(pos_token_ids)
labels.append(1)
for neg_id in range(len(batch)):
if neg_id == pos_id:
continue
elif random.random() < 0.5:
neg_token_ids, _ = tokenizer.encode(batch[neg_id], maxlen=maxlen)
texts_list[0].append(pos_token_ids)
texts_list[1].append(neg_token_ids)
labels.append(0)
else:
neg_token_ids, _ = tokenizer.encode(batch[neg_id], maxlen=maxlen)
texts_list[0].append(neg_token_ids)
texts_list[1].append(pos_token_ids)
labels.append(0)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.float, device=device)
return texts_list, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Evaluation dataset
def collate_fn_eval(batch):
texts_list = [[] for _ in range(2)]
labels = []
for text1, text2, label in batch:
texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.float, device=device)
return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)
# Define the model on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.model1 = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
self.model2 = copy.deepcopy(self.model1)
self.pool_method = pool_method
def forward(self, token_ids_list):
token_ids1 = token_ids_list[0]
hidden_state1, pool_cls1 = self.model1([token_ids1])
embeddings_a = get_pool_emb(hidden_state1, pool_cls1, token_ids1.gt(0).long(), self.pool_method)
token_ids2 = token_ids_list[1]
hidden_state2, pool_cls2 = self.model2([token_ids2])
embeddings_b = get_pool_emb(hidden_state2, pool_cls2, token_ids2.gt(0).long(), self.pool_method)
return torch.matmul(embeddings_a[:, None], embeddings_b[:, :, None]).squeeze(-1).squeeze(-1) # [btz]
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pool_cls = self.model1([token_ids])
output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return output
model = Model(pool_method=pooling).to(device)
# Define the loss and optimizer (custom ones are supported)
model.compile(
loss=nn.BCEWithLogitsLoss(reduction='mean'),
optimizer=optim.Adam(model.parameters(), lr=2e-5),  # use a sufficiently small learning rate
)
# Evaluation function
def evaluate(data):
cosine_scores, labels = [], []
for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
cosine_scores.append(cosine_score)
labels.append(label)
cosine_scores = np.concatenate(cosine_scores)
labels = torch.cat(labels).cpu().numpy()
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
return eval_spearman_cosine
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_cosine = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_cosine = evaluate(valid_dataloader)
if val_cosine > self.best_val_cosine:
self.best_val_cosine = val_cosine
# model.save_weights('best_model.pt')
print(f'val_cosine: {val_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader,
epochs=5,
steps_per_epoch=None,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Semantic similarity task - unsupervised
# loss: contrastive loss (similar to SimCSE), just with two separate models
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | CT_In_Batch_Neg | 32.47 | 47.09| 68.56 | 27.50 | 74.00 |
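# Hedged summary of the objective below: every sentence is encoded by both models, the cosine similarities
# between the two views form a [batch, batch] matrix scaled by 20, and CrossEntropyLoss with labels
# torch.arange(batch) pushes the diagonal (the same sentence through the two encoders) above the in-batch negatives.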
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import copy
import numpy as np
from tqdm import tqdm
import sys
import jieba
jieba.initialize()
# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)
# When using NEZHA or RoFormer, the model argument of build_transformer_model needs to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
if task_name == 'PAWSX':
maxlen = 128
else:
maxlen = 64
# BERT configuration
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
def load_data(filenames):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], float(l[2])))
return D
all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
np.random.shuffle(train_texts)
train_texts = train_texts[:10000]
# Training dataset
def collate_fn(batch):
texts_list = [[] for _ in range(2)]
for text in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
texts_list[0].append(token_ids)
texts_list[1].append(token_ids)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
return texts_list, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
# Evaluation dataset
def collate_fn_eval(batch):
texts_list = [[] for _ in range(2)]
labels = []
for text1, text2, label in batch:
texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.float, device=device)
return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)
# Define the model on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls', scale=20.0):
super().__init__()
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.model1 = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
self.model2 = copy.deepcopy(self.model1)
self.pool_method = pool_method
self.scale = scale
def forward(self, token_ids_list):
token_ids = token_ids_list[0]
hidden_state1, pooler1 = self.model1([token_ids])
embeddings_a = get_pool_emb(hidden_state1, pooler1, token_ids.gt(0).long(), self.pool_method)
token_ids = token_ids_list[1]
hidden_state2, pooler2 = self.model2([token_ids])
embeddings_b = get_pool_emb(hidden_state2, pooler2, token_ids.gt(0).long(), self.pool_method)
scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale # [btz, btz]
return scores
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.model1([token_ids])
output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
return output
@staticmethod
def cos_sim(a, b):
a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
return torch.mm(a_norm, b_norm.transpose(0, 1))
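# Note: scale=20.0 acts as an inverse temperature (equivalent to a temperature of 0.05) on the cosine scores
# before CrossEntropyLoss; without it the logits would be confined to [-1, 1] and the softmax would be too flat.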
model = Model(pool_method=pooling).to(device)
# Define the loss and optimizer (custom ones are supported)
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
# Evaluation function
def evaluate(data):
cosine_scores, labels = [], []
for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
cosine_scores.append(cosine_score)
labels.append(label)
cosine_scores = np.concatenate(cosine_scores)
labels = torch.cat(labels).cpu().numpy()
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
return eval_spearman_cosine
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_cosine = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_cosine = evaluate(valid_dataloader)
if val_cosine > self.best_val_cosine:
self.best_val_cosine = val_cosine
# model.save_weights('best_model.pt')
print(f'val_cosine: {val_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader,
epochs=5,
steps_per_epoch=None,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.pt')
#! -*- coding: utf-8 -*-
# DiffCSE Chinese test: the generator and the discriminator of the ELECTRA part both use the same BERT model
# Original project: https://github.com/voidism/DiffCSE
# The original project duplicates the batch (btz * 2) when applying the masking
from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
from bert4torch.snippets import ListDataset
import torch.nn.functional as F
import sys
import jieba
jieba.initialize()
# ============================= Basic parameters =============================
# model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.3  # for debugging
print(model_type, pooling, task_name, dropout_rate)
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
if task_name == 'PAWSX':
maxlen = 128
else:
maxlen = 64
lambda_weight = 0.05  # weight of the ELECTRA loss term
# BERT configuration
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
def load_data(filenames):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], float(l[2])))
return D
all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
np.random.shuffle(train_texts)
train_texts = train_texts[:10000]
def mask_tokens(inputs, special_tokens_mask=None):
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
mlm_probability = 0.3
special_tokens = {tokenizer._token_start_id, tokenizer._token_end_id, tokenizer._token_pad_id,
tokenizer._token_unk_id, tokenizer._token_mask_id}
inputs = inputs.clone()
labels = inputs.clone()
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
probability_matrix = torch.full(labels.shape, mlm_probability)
if special_tokens_mask is None:
special_tokens_mask = [[val in special_tokens for val in smp] for smp in labels.tolist()]
special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
else:
special_tokens_mask = special_tokens_mask.bool()
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = tokenizer._token_mask_id
# 10% of the time, we replace masked input tokens with random word
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
random_words = torch.randint(tokenizer._vocab_size, labels.shape, dtype=torch.long, device=device)
inputs[indices_random] = random_words[indices_random]
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
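# Arithmetic check for the masking above (descriptive): of the positions drawn with mlm_probability=0.3,
# 80% become [MASK], half of the remaining 20% (i.e. 10%) become a random token, and the last 10% keep the
# original token; labels stay -100 everywhere else so only the sampled positions would contribute to an MLM loss.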
# Training dataset
def collate_fn(batch):
input_ids = []
for text in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
input_ids.append(token_ids)
input_ids.extend(input_ids)
input_ids = torch.tensor(sequence_padding(input_ids), dtype=torch.long, device=device)
labels = torch.arange(len(batch), device=device)
# Build mlm_inputs and mlm_labels
mlm_inputs, mlm_labels = mask_tokens(input_ids)
attention_mask = input_ids.gt(0).long()
return [input_ids, mlm_inputs], [labels, mlm_labels, attention_mask]
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
# Evaluation dataset
def collate_fn_eval(batch):
texts_list = [[] for _ in range(2)]
labels = []
for text1, text2, label in batch:
texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.float, device=device)
return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)
# Define the generator
generator = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate, with_mlm=True)
generator.to(device)
generator.eval()
class ProjectionMLP(nn.Module):
def __init__(self, hidden_size):
super().__init__()
in_dim = hidden_size
hidden_dim = hidden_size * 2
out_dim = hidden_size
affine=False
list_layers = [nn.Linear(in_dim, hidden_dim, bias=False),
nn.BatchNorm1d(hidden_dim),
nn.ReLU(inplace=True)]
list_layers += [nn.Linear(hidden_dim, out_dim, bias=False),
nn.BatchNorm1d(out_dim, affine=affine)]
self.net = nn.Sequential(*list_layers)
def forward(self, x):
return self.net(x)
class Similarity(nn.Module):
"""
Dot product or cosine similarity
"""
def __init__(self, temp):
super().__init__()
self.temp = temp
self.cos = nn.CosineSimilarity(dim=-1)
self.record = None
self.pos_avg = 0.0
self.neg_avg = 0.0
def forward(self, x, y):
sim = self.cos(x, y)
self.record = sim.detach()
min_size = min(self.record.shape[0], self.record.shape[1])
num_item = self.record.shape[0] * self.record.shape[1]
self.pos_avg = self.record.diag().sum() / min_size
self.neg_avg = (self.record.sum() - self.record.diag().sum()) / (num_item - min_size)
return sim / self.temp
# Build the model
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.bert = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
self.mlp = ProjectionMLP(self.bert.configs['hidden_size'])
self.discriminator = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate)
self.electra_head = nn.Linear(self.bert.configs['hidden_size'], 2)
self.sim = Similarity(temp=0.05)
def forward(self, input_ids, mlm_inputs):
# Same computation logic as in ESimCSE
attention_mask = input_ids.gt(0).long()
hidden_state1, pooler = self.bert([input_ids])
reps = get_pool_emb(hidden_state1, pooler, attention_mask, self.pool_method)
if self.pool_method == 'cls':
reps = self.mlp(reps)
batch_size = input_ids.shape[0]//2
embeddings_a = reps[:batch_size]
embeddings_b = reps[batch_size:]
scores = self.sim(embeddings_a.unsqueeze(1), embeddings_b.unsqueeze(0)) # [btz, btz]
# Calculate loss for conditional ELECTRA
with torch.no_grad():
g_pred = generator([mlm_inputs])[1].argmax(-1) # [btz, seq_len]
g_pred[:, 0] = tokenizer._token_start_id
e_labels = (g_pred != input_ids) * attention_mask
e_inputs = g_pred * attention_mask
# The [CLS] position needs to be replaced with the sentence embedding
embeddings = self.discriminator.apply_embeddings([e_inputs])
embeddings[0] = torch.cat([reps.unsqueeze(1), embeddings[0][:, 1:, :]], dim=1)
outputs = self.discriminator.apply_main_layers(embeddings)
mlm_outputs = self.discriminator.apply_final_layers(outputs)
prediction_scores = self.electra_head(mlm_outputs)
return scores, prediction_scores, e_labels
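# Hedged walk-through of forward(): the SimCSE branch scores the two dropout views of the batch against each
# other; for the conditional ELECTRA branch the generator (eval mode, not trained) predicts tokens for the
# corrupted mlm_inputs, e_labels marks positions whose prediction differs from the original input, and the
# discriminator, whose first-position embedding is overwritten by the sentence vector reps, classifies each
# token as replaced vs. original through electra_head.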
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.bert([token_ids])
output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
return output
class MyLoss(nn.Module):
def forward(self, model_outputs, model_labels):
scores, prediction_scores, e_labels = model_outputs
labels, mlm_labels, attention_mask = model_labels
# mlm_labels are not used here; they would mainly be needed to compute a generator loss, and in this method the generator is not trained
loss_simcse = F.cross_entropy(scores, labels)
loss_electra = lambda_weight * F.cross_entropy(prediction_scores.view(-1, 2), e_labels.view(-1))
return {'loss': loss_simcse+loss_electra, 'loss_simcse': loss_simcse, 'loss_electra': loss_electra}
def cal_metric(model_outputs, model_labels):
scores, prediction_scores, e_labels = model_outputs
labels, mlm_labels, attention_mask = model_labels
rep = (e_labels == 1) * attention_mask
fix = (e_labels == 0) * attention_mask
prediction = prediction_scores.argmax(-1)
result = {}
result['electra_rep_acc'] = float((prediction*rep).sum()/rep.sum())
result['electra_fix_acc'] = float(1.0 - (prediction*fix).sum()/fix.sum())
result['electra_acc'] = float(((prediction == e_labels) * attention_mask).sum()/attention_mask.sum())
return result
model = Model(pool_method=pooling).to(device)
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 7e-6), metrics=cal_metric)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_cosine = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_cosine = evaluate(valid_dataloader)
if val_cosine > self.best_val_cosine:
self.best_val_cosine = val_cosine
# model.save_weights('best_model.pt')
print(f'val_cosine: {val_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
def evaluate(dataloader):
# Model prediction
# Normalize, compute similarities and the correlation coefficient
sims_list, labels = [], []
for (a_token_ids, b_token_ids), label in tqdm(dataloader):
a_vecs = model.encode(a_token_ids)
b_vecs = model.encode(b_token_ids)
a_vecs = torch.nn.functional.normalize(a_vecs, p=2, dim=1).cpu().numpy()
b_vecs = torch.nn.functional.normalize(b_vecs, p=2, dim=1).cpu().numpy()
sims = (a_vecs * b_vecs).sum(axis=1)
sims_list.append(sims)
labels.append(label.cpu().numpy())
corrcoef = scipy.stats.spearmanr(np.concatenate(labels), np.concatenate(sims_list)).correlation
return corrcoef
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=5, callbacks=[evaluator])
#! -*- coding: utf-8 -*-
# ESimCSE Chinese test
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | ESimCSE | 34.05 | 50.54| 71.58 | 12.53 | 71.27 |
from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
import random
import copy
import sys
from bert4torch.snippets import ListDataset
import jieba
jieba.initialize()
class CollateFunc(object):
'''Duplicate words inside each sentence (word repetition) to build positives, and maintain a queue for extra negatives
'''
def __init__(self, tokenizer, max_len=256, q_size=160, dup_rate=0.15):
self.q = []
self.q_size = q_size
self.max_len = max_len
self.dup_rate = dup_rate
self.tokenizer = tokenizer
def word_repetition(self, batch_text, pre_tokenize=False):
dst_text = list()
for text in batch_text:
if pre_tokenize:
cut_text = jieba.cut(text, cut_all=False)
text = list(cut_text)
actual_len = len(text)
dup_len = random.randint(a=0, b=max(2, int(self.dup_rate * actual_len)))
try:
dup_word_index = random.sample(list(range(1, actual_len)), k=dup_len)
except:
dup_word_index = set()
dup_text = ''
for index, word in enumerate(text):
dup_text += word
if index in dup_word_index:
dup_text += word
dst_text.append(dup_text)
return dst_text
def negative_samples(self, batch_src_text):
batch_size = len(batch_src_text)
negative_samples = None
if len(self.q) > 0:
negative_samples = self.q[:self.q_size]
# print("size of negative_samples", len(negative_samples))
if len(self.q) + batch_size >= self.q_size:
del self.q[:batch_size]
self.q.extend(batch_src_text)
return negative_samples
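# Descriptive note on the queue above: self.q caches sentences from earlier batches and serves up to q_size
# of them as extra negatives; in Model.forward they are encoded by the momentum encoder, so each step sees
# the in-batch negatives plus up to q_size queued ones.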
def __call__(self, batch_text):
'''
input: batch_text: [batch_text,]
output: batch_src_text, batch_dst_text, batch_neg_text
'''
batch_pos_text = self.word_repetition(batch_text)
batch_neg_text = self.negative_samples(batch_text)
# print(len(batch_pos_text))
batch_tokens_list, batch_pos_tokens_list = [], []
for text, text_pos in zip(batch_text, batch_pos_text):
batch_tokens_list.append(self.tokenizer.encode(text, maxlen=maxlen)[0])
batch_pos_tokens_list.append(self.tokenizer.encode(text_pos, maxlen=maxlen)[0])
batch_neg_tokens_list = []
if batch_neg_text:
for text in batch_neg_text:
batch_neg_tokens_list.append(self.tokenizer.encode(text, maxlen=maxlen)[0])
batch_tokens_list = torch.tensor(sequence_padding(batch_tokens_list), dtype=torch.long, device=device)
batch_pos_tokens_list = torch.tensor(sequence_padding(batch_pos_tokens_list), dtype=torch.long, device=device)
labels = torch.arange(batch_tokens_list.size(0), device=batch_tokens_list.device)
if batch_neg_tokens_list:
batch_neg_tokens_list = torch.tensor(sequence_padding(batch_neg_tokens_list), dtype=torch.long, device=device)
return [batch_tokens_list, batch_pos_tokens_list, batch_neg_tokens_list], labels
else:
return [batch_tokens_list, batch_pos_tokens_list], labels
# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'STS-B', 0.3  # for debugging
print(model_type, pooling, task_name, dropout_rate)
# When using NEZHA or RoFormer, the model argument of build_transformer_model needs to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
if task_name == 'PAWSX':
maxlen = 128
else:
maxlen = 64
# BERT configuration
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
def load_data(filenames):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], float(l[2])))
return D
all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
np.random.shuffle(train_texts)
train_texts = train_texts[:10000]
train_call_func = CollateFunc(tokenizer, max_len=maxlen, q_size=64, dup_rate=0.15)
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=train_call_func)
def collate_fn_eval(batch):
texts_list = [[] for _ in range(2)]
labels = []
for text1, text2, label in batch:
texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.float, device=device)
return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)
# Build the model
class Model(BaseModel):
def __init__(self, pool_method='cls', scale=20.0):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.encoder = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
self.momentum_encoder = copy.deepcopy(self.encoder)
self.scale = scale
def forward(self, token_ids_list):
reps = []
for token_ids in token_ids_list[:2]:
hidden_state1, pooler = self.encoder([token_ids])
rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
reps.append(rep)
if len(token_ids_list) == 3: # negative samples from the queue
hidden_state1, pooler = self.momentum_encoder([token_ids_list[2]])
rep = get_pool_emb(hidden_state1, pooler, token_ids_list[2].gt(0).long(), self.pool_method)
reps.append(rep)
embeddings_a = reps[0]
embeddings_b = torch.cat(reps[1:])
scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale # [btz, btz]
return scores
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.encoder([token_ids])
output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
return output
@staticmethod
def cos_sim(a, b):
a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
return torch.mm(a_norm, b_norm.transpose(0, 1))
model = Model(pool_method=pooling).to(device)
class Momentum(object):
''' Momentum (EMA) update; implemented as a "scheduler" here because it is invoked right after optimizer.step()
'''
def __init__(self, gamma=0.95) -> None:
self.gamma = gamma
def step(self):
for encoder_param, moco_encoder_param in zip(model.encoder.parameters(), model.momentum_encoder.parameters()):
moco_encoder_param.data = self.gamma * moco_encoder_param.data + (1. - self.gamma) * encoder_param.data
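# The momentum (EMA) update performed above, written out:
#   moco_param <- gamma * moco_param + (1 - gamma) * encoder_param
# With gamma=0.95 the momentum encoder trails the trained encoder, giving more stable embeddings for the queued negatives.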
model.compile(loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), 1e-5),
scheduler=Momentum(gamma=0.95))
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_cosine = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_cosine = evaluate(valid_dataloader)
if val_cosine > self.best_val_cosine:
self.best_val_cosine = val_cosine
# model.save_weights('best_model.pt')
print(f'val_cosine: {val_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
def evaluate(dataloader):
# Model prediction
# Normalize, compute similarities and the correlation coefficient
sims_list, labels = [], []
for (a_token_ids, b_token_ids), label in tqdm(dataloader):
a_vecs = model.encode(a_token_ids)
b_vecs = model.encode(b_token_ids)
a_vecs = torch.nn.functional.normalize(a_vecs, p=2, dim=1).cpu().numpy()
b_vecs = torch.nn.functional.normalize(b_vecs, p=2, dim=1).cpu().numpy()
sims = (a_vecs * b_vecs).sum(axis=1)
sims_list.append(sims)
labels.append(label.cpu().numpy())
corrcoef = scipy.stats.spearmanr(np.concatenate(labels), np.concatenate(sims_list)).correlation
return corrcoef
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=5, callbacks=[evaluator])
#! -*- coding: utf-8 -*-
# Sentence embeddings with PromptBERT
# Official repo: https://github.com/kongds/Prompt-BERT
# Reference repo: https://github.com/Macielyoung/sentence_representation_matching
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | PromptBert | 33.98 | 49.89| 73.18 | 13.30 | 73.42 |
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import ListDataset, sequence_padding, Callback
from torch.utils.data import DataLoader
from scipy.stats import pearsonr, spearmanr
import numpy as np
import sys
import jieba
jieba.initialize()
# ============================= Basic parameters =============================
model_type, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, task_name, dropout_rate = 'BERT', 'ATEC', 0.3  # for debugging
print(model_type, task_name, dropout_rate)
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
template_len = 15
if task_name == 'PAWSX':
maxlen = 128 + template_len
else:
maxlen = 64 + template_len
# BERT configuration
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False), add_special_tokens='[X]')
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True, add_special_tokens='[X]')
replace_token = "[X]"
mask_token = "[MASK]"
prompt_templates = ['"{}" 的意思为[MASK]'.format(replace_token), '"{}"这句话的意思是[MASK]'.format(replace_token)]
tao = 0.05
token_dict = load_vocab(dict_path)
compound_tokens = [[len(token_dict)]]
token_dict['[X]'] = len(token_dict)
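# Descriptive note (partly an assumption about the bert4torch API based on this usage): the prompt templates
# use a placeholder token [X] that is not in the original vocabulary; add_special_tokens='[X]' registers it
# with the tokenizer, and compound_tokens is passed to build_transformer_model below so the embedding matrix
# is extended to cover the new token id.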
# Load the data
def load_data(filenames):
D = []
for filename in filenames:
with open(filename, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines(), desc='Load data'):
cache = line.split('\t')
text1, text2, label = cache[0][:maxlen-template_len], cache[1][:maxlen-template_len], cache[-1]
for text in [text1, text2]:
sentence_pair = []
for template in prompt_templates:
sent_num = len(tokenizer.tokenize(text))
prompt_sent = template.replace(replace_token, text)
template_sent = template.replace(replace_token, replace_token * sent_num)
sentence_pair.extend([prompt_sent, template_sent])
D.append((sentence_pair, int(label)))
return D
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
train_texts = load_data(all_names)
valid_texts = list(zip(train_texts[::2], train_texts[1::2]))
if task_name != 'PAWSX':
np.random.shuffle(train_texts)
train_texts = train_texts[:10000]
# Training dataset
def collate_fn(batch):
batch_tensor = [[] for _ in range(4)]
for prompt_data, _ in batch:
for i, item in enumerate(prompt_data):
batch_tensor[i].append(tokenizer.encode(item, maxlen=maxlen)[0])
for i, item in enumerate(batch_tensor):
batch_tensor[i] = torch.tensor(sequence_padding(item, maxlen), dtype=torch.long, device=device)
labels = torch.arange(batch_tensor[0].size(0), device=device)
return batch_tensor, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Evaluation dataset
def collate_fn_test(batch):
text1_ids, text2_ids, labels = [], [], []
for text1, text2 in batch:
label = text1[-1]
text1, text2 = text1[0][0], text2[0][0]
text1_ids.append(tokenizer.encode(text1, maxlen=maxlen)[0])
text2_ids.append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
text1_ids = torch.tensor(sequence_padding(text1_ids), dtype=torch.long, device=device)
text2_ids = torch.tensor(sequence_padding(text2_ids), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.long, device=device)
return [text1_ids, text2_ids], labels
valid_dataloader = DataLoader(ListDataset(data=valid_texts), batch_size=batch_size, collate_fn=collate_fn_test)
# ============================= Define the model =============================
class PromptBert(BaseModel):
def __init__(self, scale=20.0):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=model_name,
dropout_rate=dropout_rate, segment_vocab_size=0, compound_tokens=compound_tokens)
self.scale = scale
def forward(self, prompt0_input, template0_input, prompt1_input, template1_input):
embeddings_a = self.get_sentence_embedding(prompt0_input, template0_input)
embeddings_b = self.get_sentence_embedding(prompt1_input, template1_input)
scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale # [btz, btz]
return scores
def get_sentence_embedding(self, prompt_input_ids, template_input_ids):
prompt_mask_embedding = self.get_mask_embedding(prompt_input_ids)
template_mask_embedding = self.get_mask_embedding(template_input_ids)
# To remove the influence of the prompt template in the loss, the sentence representation is obtained by subtracting the [MASK] representation of the template-only input from the [MASK] representation of the prompted sentence
sentence_embedding = prompt_mask_embedding - template_mask_embedding
return sentence_embedding
def get_mask_embedding(self, input_ids):
last_hidden_state = self.bert([input_ids])
mask_index = (input_ids == tokenizer._token_mask_id).long()
input_mask_expanded = mask_index.unsqueeze(-1).expand(last_hidden_state.size()).float()
mask_embedding = torch.sum(last_hidden_state * input_mask_expanded, 1)
return mask_embedding
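# Descriptive note: mask_index is a 0/1 tensor marking the [MASK] position; expanding it and summing over the
# sequence dimension extracts the hidden state at [MASK], which serves as the sentence representation before
# the template subtraction in get_sentence_embedding.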
def predict(self, input_ids):
self.eval()
with torch.no_grad():
mask_embedding = self.get_mask_embedding(input_ids)
return mask_embedding
@staticmethod
def cos_sim(a, b):
a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
return torch.mm(a_norm, b_norm.transpose(0, 1))
model = PromptBert().to(device)
# Define the loss and optimizer (custom ones are supported)
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_sim = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_sim = self.evaluate(valid_dataloader)
if val_sim > self.best_val_sim:
self.best_val_sim = val_sim
# model.save_weights('best_model.pt')
print(f'val_sim: {val_sim:.5f}, best_val_sim: {self.best_val_sim:.5f}\n')
@staticmethod
def evaluate(data):
embeddings1, embeddings2, labels = [], [], []
for (text1_ids, text2_ids), label in data:
embeddings1.append(model.predict(text1_ids))
embeddings2.append(model.predict(text2_ids))
labels.append(label)
embeddings1 = torch.cat(embeddings1)
embeddings2 = torch.cat(embeddings2)
labels = torch.cat(labels)
sims = F.cosine_similarity(embeddings1, embeddings2).cpu().numpy()
labels = labels.cpu().numpy()
return spearmanr(sims, labels)[0]
if __name__ == "__main__":
evaluator = Evaluator()
model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
#! -*- coding: utf-8 -*-
# SimCSE Chinese test
# bert4keras write-up: https://kexue.fm/archives/8348
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | SimCSE | 33.90 | 50.29| 71.81 | 13.14 | 71.09 |
from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
from bert4torch.snippets import ListDataset
import sys
import jieba
jieba.initialize()
# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.3  # for debugging
print(model_type, pooling, task_name, dropout_rate)
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
if task_name == 'PAWSX':
maxlen = 128
else:
maxlen = 64
# BERT configuration
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
def load_data(filenames):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], float(l[2])))
return D
all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
np.random.shuffle(train_texts)
train_texts = train_texts[:10000]
# Training dataset
def collate_fn(batch):
texts_list = [[] for _ in range(2)]
for text in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
texts_list[0].append(token_ids)
texts_list[1].append(token_ids)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
return texts_list, labels
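# Hedged note on the SimCSE setup above: the same token_ids are placed in both views, so the only difference
# between the two forward passes is BERT's internal dropout; with labels = torch.arange(batch) the diagonal of
# the scaled cosine matrix (a sentence against its own dropout view) is the positive class for CrossEntropyLoss.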
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
# Evaluation dataset
def collate_fn_eval(batch):
texts_list = [[] for _ in range(2)]
labels = []
for text1, text2, label in batch:
texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.float, device=device)
return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)
# Build the model
class Model(BaseModel):
def __init__(self, pool_method='cls', scale=20.0):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.bert = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
self.scale = scale
def forward(self, token_ids_list):
reps = []
for token_ids in token_ids_list:
hidden_state1, pooler = self.bert([token_ids])
rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
reps.append(rep)
embeddings_a = reps[0]
embeddings_b = torch.cat(reps[1:])
scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale # [btz, btz]
return scores
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.bert([token_ids])
output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
return output
@staticmethod
def cos_sim(a, b):
a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
return torch.mm(a_norm, b_norm.transpose(0, 1))
model = Model(pool_method=pooling).to(device)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5))
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_cosine = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_cosine = evaluate(valid_dataloader)
if val_cosine > self.best_val_cosine:
self.best_val_cosine = val_cosine
# model.save_weights('best_model.pt')
print(f'val_cosine: {val_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
def evaluate(dataloader):
# Model prediction
# Normalize, compute similarities and the correlation coefficient
sims_list, labels = [], []
for (a_token_ids, b_token_ids), label in tqdm(dataloader):
a_vecs = model.encode(a_token_ids)
b_vecs = model.encode(b_token_ids)
a_vecs = torch.nn.functional.normalize(a_vecs, p=2, dim=1).cpu().numpy()
b_vecs = torch.nn.functional.normalize(b_vecs, p=2, dim=1).cpu().numpy()
sims = (a_vecs * b_vecs).sum(axis=1)
sims_list.append(sims)
labels.append(label.cpu().numpy())
corrcoef = scipy.stats.spearmanr(np.concatenate(labels), np.concatenate(sims_list)).correlation
return corrcoef
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=None, epochs=5, callbacks=[evaluator])
#! -*- coding:utf-8 -*-
# Semantic similarity task - unsupervised
# An encoder takes a sentence with tokens deleted and produces a sentence vector; a decoder reconstructs the original sentence from that vector
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B | comment |
# | TSDAE | —— | 46.65 | 65.30 | 12.54 | —— | "——" means the metric was anomalous and not recorded |
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
import re
from tqdm import tqdm
import sys
import jieba
jieba.initialize()
# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
if task_name == 'PAWSX':
maxlen = 128
else:
maxlen = 64
# bert配置
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# =============================加载数据集=============================
# 建立分词器
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 读数据
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
def load_data(filenames):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], float(l[2])))
return D
all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
np.random.shuffle(train_texts)
train_texts = train_texts[:10000]
# 加载训练数据集
def collate_fn(batch):
def add_noise(token_ids, del_ratio=0.6):
n = len(token_ids)
keep_or_not = np.random.rand(n) > del_ratio
if sum(keep_or_not) == 0:
keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains
return list(np.array(token_ids)[keep_or_not])
texts_list = [[] for _ in range(3)]
for text in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
texts_list[0].append([tokenizer._token_start_id] + add_noise(token_ids[1:-1]) + [tokenizer._token_end_id])
texts_list[1].append(token_ids[:-1])
texts_list[2].append(token_ids[1:])
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
return texts_list[:2], texts_list[2].flatten()
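# ------------------------------------------------------------------
# 补充示例(非原脚本内容,仅作示意):演示上面 collate_fn 中 TSDAE 删词噪声的效果。
# encoder 只看到随机删减后的句子,decoder 以 teacher-forcing 方式恢复完整原句
# (decoder 输入为 token_ids[:-1],目标为 token_ids[1:])。101/102 仅为示意的 [CLS]/[SEP] id。
import numpy as np

demo_ids = [101, 5, 6, 7, 8, 102]
np.random.seed(0)
keep = np.random.rand(len(demo_ids) - 2) > 0.6          # del_ratio=0.6,约保留40%的中间token
if keep.sum() == 0:
    keep[np.random.choice(len(keep))] = True            # 至少保留一个token
noised = [demo_ids[0]] + list(np.array(demo_ids[1:-1])[keep]) + [demo_ids[-1]]
print(noised, demo_ids[:-1], demo_ids[1:])               # encoder输入 / decoder输入 / decoder目标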
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
# 加载测试数据集
def collate_fn_eval(batch):
texts_list = [[] for _ in range(2)]
labels = []
for text1, text2, label in batch:
texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.float, device=device)
return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.encoder = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0, dropout_rate=dropout_rate,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
# 用bert的权重来初始化decoder,crossAttn部分是随机初始化的
self.decoder = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=model_name, application='lm', dropout_rate=dropout_rate,
output_all_encoded_layers=output_all_encoded_layers, is_decoder=True, segment_vocab_size=0)
self.pool_method = pool_method
# 绑定encoder和decoder的权重
decoder_names = {k for k, _ in self.decoder.named_parameters()}
for enc_k, v in self.encoder.named_parameters():
dec_k = enc_k
if dec_k in decoder_names:
rep_str = f'self.encoder.{enc_k} = self.decoder.{dec_k}'
                if re.search(r'\.[0-9]+\.', rep_str):
                    temp = '[' + re.findall(r'\.[0-9]+\.', rep_str)[0][1:-1] + '].'
                    rep_str = re.sub(r'\.[0-9]+\.', temp, rep_str)
exec(rep_str)
else:
print(enc_k, dec_k)
def forward(self, token_ids_list):
token_ids1 = token_ids_list[0]
hidden_state1, pool_cls1 = self.encoder([token_ids1])
embeddings_a = get_pool_emb(hidden_state1, pool_cls1, token_ids1.gt(0).long(), self.pool_method)
token_ids2 = token_ids_list[1]
encoder_embedding = embeddings_a.unsqueeze(1)
encoder_attention_mask = torch.ones_like(token_ids1)[:, 0:1][:, None, None, :]
_, logits = self.decoder([token_ids2, encoder_embedding, encoder_attention_mask])
return logits.reshape(-1, logits.shape[-1])
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pool_cls = self.encoder([token_ids])
output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return output
model = Model(pool_method=pooling).to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(ignore_index=0),
optimizer=optim.Adam(model.parameters(), lr=2e-4),
)
# 定义评价函数
def evaluate(data):
cosine_scores, labels = [], []
for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
cosine_scores.append(cosine_score)
labels.append(label)
cosine_scores = np.concatenate(cosine_scores)
labels = torch.cat(labels).cpu().numpy()
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_spearman_cosine
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
        self.best_val_cosine = 0.
    def on_epoch_end(self, global_step, epoch, logs=None):
        val_cosine = evaluate(valid_dataloader)
        if val_cosine > self.best_val_cosine:
            self.best_val_cosine = val_cosine
            # model.save_weights('best_model.pt')
        print(f'val_cosine: {val_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader,
epochs=5,
steps_per_epoch=None,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# bert_whitening
# 官方项目:https://github.com/bojone/BERT-whitening
# cls+不降维
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | Bert-whitening | 26.79 | 31.81| 56.34 | 17.22 | 67.45 |
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, get_pool_emb
from bert4torch.layers import BERT_WHITENING
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import numpy as np
import scipy.stats
import sys
import jieba
jieba.initialize()
# =============================基本参数=============================
# model_type, pooling, task_name, n_components = sys.argv[1:] # 传入参数
model_type, pooling, task_name, n_components = 'BERT', 'cls', 'ATEC', -1 # debug使用
print(model_type, pooling, task_name, n_components)
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
model_name = 'bert'
elif model_type in {'RoFormer'}:
model_name = 'roformer'
elif model_type in {'NEZHA'}:
model_name = 'nezha'
n_components = int(n_components)
if n_components < 0:
if model_type.endswith('large'):
n_components = 1024
elif model_type.endswith('tiny'):
n_components = 312
elif model_type.endswith('small'):
n_components = 384
else:
n_components = 768
batch_size = 128
if task_name == 'PAWSX':
maxlen = 128
else:
maxlen = 64
# bert配置
model_dir = {
'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# =============================加载数据集=============================
# 建立分词器
if model_type in ['RoFormer']:
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 读数据
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], float(l[2])))
# if len(D) > 1000:
# break
return D
def collate_fn(batch):
batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
for text1, text2, label in batch:
token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
batch_token1_ids.append(token1_ids)
token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
batch_token2_ids.append(token2_ids)
batch_labels.append([label])
batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(all_names), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='mean'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.pool_method = pool_method
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pool_cls = self.bert([token_ids])
attention_mask = token_ids.gt(0).long()
output = get_pool_emb(hidden_state, pool_cls, attention_mask, self.pool_method)
return output
model = Model(pool_method=pooling).to(device)
# 提取训练集的所有句向量
sen_emb_list, sen_labels = [], []
for token_ids, labels in tqdm(train_dataloader, desc='Encoding'):
sen1_emb = model.encode(token_ids[0])
sen2_emb = model.encode(token_ids[1])
sen_emb_list.append((sen1_emb, sen2_emb))
sen_labels.append(labels)
# 调用bert_whitening模块
bert_whitening = BERT_WHITENING()
if n_components > 0:
bert_whitening.compute_kernel_bias([v for vecs in sen_emb_list for v in vecs])
bert_whitening.kernel = bert_whitening.kernel[:, :n_components]
# 变换,标准化,相似度,相关系数
all_sims = []
for (a_vecs, b_vecs) in tqdm(sen_emb_list, desc='Transform'):
a_vecs = bert_whitening.transform_and_normalize(a_vecs)
b_vecs = bert_whitening.transform_and_normalize(b_vecs)
sims = (a_vecs * b_vecs).sum(axis=1)
all_sims.append(sims)
all_sims = torch.cat(all_sims, dim=0)
sen_labels = torch.cat(sen_labels, dim=0)
corrcoef = scipy.stats.spearmanr(sen_labels.cpu().numpy(), all_sims.cpu().numpy()).correlation
print(f'{task_name} corrcoefs: ', corrcoef)
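# ------------------------------------------------------------------
# 补充示例(非原脚本内容):BERT-whitening 变换的核心步骤,按上面官方项目
# (https://github.com/bojone/BERT-whitening)的通用做法用 numpy 写的小示例,
# bert4torch 中 BERT_WHITENING 的具体实现可能在细节上有差异,此处仅作示意。
import numpy as np

def demo_compute_kernel_bias(vecs):
    """kernel: 对协方差矩阵做SVD得到的白化矩阵;bias: 负均值"""
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.cov(vecs.T)
    u, s, _ = np.linalg.svd(cov)
    W = np.dot(u, np.diag(1.0 / np.sqrt(s)))
    return W, -mu

def demo_transform_and_normalize(vecs, kernel, bias):
    vecs = (vecs + bias).dot(kernel)
    return vecs / np.clip(np.linalg.norm(vecs, axis=1, keepdims=True), 1e-8, None)

toy_vecs = np.random.randn(100, 16)
W_demo, b_demo = demo_compute_kernel_bias(toy_vecs)
print(demo_transform_and_normalize(toy_vecs, W_demo, b_demo).shape)  # (100, 16),降维时可截取kernel前n_components列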
#! -*- coding: utf-8 -*-
# KgCLUE baseline
# 直接用UniLM做Seq2Seq,然后前缀树约束解码,并加入自研的“前瞻”策略;
# 基础模型为RoFormer-Sim-FT,相比直接用RoFormer/BERT/RoBERTa有2%的提升;
# 介绍链接:https://kexue.fm/archives/8802
import os, json
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch.optim as optim
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset, sequence_padding, AutoRegressiveDecoder, Callback
from tqdm import tqdm
from collections import defaultdict
# import pylcs
def lcs(source, target):
"""最长公共子序列(source和target的最长非连续子序列)
返回:子序列长度, 映射关系(映射对组成的list)
注意:最长公共子序列可能不止一个,所返回的映射只代表其中一个。
"""
c = defaultdict(int)
for i, si in enumerate(source, 1):
for j, tj in enumerate(target, 1):
if si == tj:
c[i, j] = c[i - 1, j - 1] + 1
elif c[i, j - 1] > c[i - 1, j]:
c[i, j] = c[i, j - 1]
else:
c[i, j] = c[i - 1, j]
l, mapping = c[len(source), len(target)], []
i, j = len(source) - 1, len(target) - 1
while len(mapping) < l:
if source[i] == target[j]:
mapping.append((i, j))
i, j = i - 1, j - 1
elif c[i + 1, j] > c[i, j + 1]:
j = j - 1
else:
i = i - 1
return l, mapping[::-1]
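# 补充示例(非原脚本内容):lcs 的一个简单用法。
# 'abcde' 与 'ace' 的最长公共子序列为 'ace',长度3,
# mapping 给出其中一种下标对应:[(0, 0), (2, 1), (4, 2)]。
assert lcs('abcde', 'ace')[0] == 3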
def subject_split(s):
"""如果有义项,那么单独分离出来
"""
m = ''
if s[-1] == u')':
i = s.index(u'(')
m = s[i + 1:-1]
s = s[:i]
return s, m
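# 补充示例(非原脚本内容):subject_split 会把带义项的主语拆开,
# 不带义项则义项部分返回空字符串。
assert subject_split(u'苹果(水果)') == (u'苹果', u'水果')
assert subject_split(u'苹果') == (u'苹果', '')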
def load_data(filename):
"""读取数据集
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
s, p, o = l['answer'].split(' ||| ')
s, m = subject_split(s)
D.append((l['question'], (s, p, m, ' '.join(o.split()))))
return D
class Trie(object):
"""自定义Trie树对象,用来保存知识库
"""
def __init__(self, value_key=-1):
self.data = {}
self.value_key = str(value_key)
def __setitem__(self, key, value):
"""传入一对(key, value)到前缀树中
"""
data = self.data
for k in key:
k = str(k)
if k not in data:
data[k] = {}
data = data[k]
if self.value_key in data:
if data[self.value_key] != value:
data[self.value_key] += ('\t' + value)
else:
data[self.value_key] = value
def __getitem__(self, key):
"""获取key对应的value
"""
data = self.data
for k in key:
k = str(k)
data = data[k]
return data[self.value_key]
def next_ones(self, prefix):
"""获取prefix后一位的容许集
"""
data = self.data
for k in prefix:
k = str(k)
data = data[k]
return [k for k in data if k != self.value_key]
def keys(self, prefix=None, data=None):
"""获取以prefix开头的所有key
"""
data = data or self.data
prefix = prefix or []
for k in prefix:
k = str(k)
if k not in data:
return []
data = data[k]
results = []
for k in data:
if k == self.value_key:
results.append([])
else:
results.extend([[k] + j for j in self.keys(None, data[k])])
return [prefix + i for i in results]
def save(self, filename):
with open(filename, 'w', encoding='utf-8') as f:
json.dump(self.data, f, ensure_ascii=False)
def load(self, filename):
with open(filename, encoding='utf-8') as f:
self.data = json.load(f)
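# 补充示例(非原脚本内容):Trie 的基本用法,key 为 token id 序列,value 为答案文本。
# 注意插入时每个 key 都会被转成 str 保存。
demo_trie = Trie()
demo_trie[[1, 2, 3]] = 'obj_a'
demo_trie[[1, 2, 4]] = 'obj_b'
assert demo_trie[[1, 2, 3]] == 'obj_a'
assert sorted(demo_trie.next_ones([1, 2])) == ['3', '4']   # 前缀 [1,2] 之后容许的下一位
print(demo_trie.keys([1]))   # [[1, '2', '3'], [1, '2', '4']],后缀部分的key已是str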
# 基本参数
maxlen = 128
batch_size = 32
epochs = 10
# 模型路径
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 转换知识库
KG = Trie()
if os.path.exists('../datasets/KG.json'):
KG.load('../datasets/KG.json')
else:
with open('F:/Projects/data/corpus/kg/KgCLUE/Knowledge_20211215.txt', 'r', encoding='utf-8') as f:
# count = 0
for l in tqdm(f):
s, p, o = l.split('\t')
s, m = subject_split(s)
ids = tokenizer.encode(s, p)[0][1:]
ids += tokenizer.encode(m)[0][1:-1]
KG[ids] = ' '.join(o.split())
# count += 1
# if count > 10000:
# break
KG.save('../datasets/KG.json')
def collate_fn(batch):
"""数据生成器
单条样本:[CLS] Q [SEP] S [SEP] P [SEP] M [SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for (q, a) in batch:
q_ids = tokenizer.encode(q, maxlen=maxlen // 2 + 1)[0]
a_ids = tokenizer.encode(a[0], a[1])[0]
a_ids += tokenizer.encode(a[2])[0][1:]
token_ids = (q_ids + a_ids[1:])[:maxlen]
segment_ids = [0] * len(q_ids)
segment_ids += [1] * (len(token_ids) - len(q_ids))
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
# 读取数据集
train_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/train.json')
train_dataloader = DataLoader(ListDataset(train_data), shuffle=True, collate_fn=collate_fn)
valid_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/dev.json')
test_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/test_public.json')
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
'''
y_pred: [btz, seq_len, vocab_size]
targets: y_true, y_segment
unilm式样,需要手动把非seq2seq部分mask掉
'''
_, y_pred = outputs
y_true, y_mask = target
y_true = y_true[:, 1:]# 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return super().forward(y_pred, y_true)
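# ------------------------------------------------------------------
# 补充示例(非原脚本内容,仅作示意):演示上面 UniLM 式 loss 的 mask 逻辑。
# segment_ids 为1的位置才是要预测的部分;y_true*y_mask 把问题部分的标签置0,
# 再配合 ignore_index=0,这些位置就不会参与 loss 计算。
import torch

demo_y_true = torch.tensor([[101, 8, 9, 102, 15, 16, 102]])   # [CLS] Q Q [SEP] A A [SEP]
demo_y_mask = torch.tensor([[0,   0, 0, 0,   1,  1,  1]])      # segment_ids
print((demo_y_true[:, 1:] * demo_y_mask[:, 1:]).flatten())     # tensor([0, 0, 0, 15, 16, 102])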
model = build_transformer_model(config_path, checkpoint_path, model='roformer', application='unilm').to(device)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 5e-6))
class AutoQA(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
all_token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
_, y_pred = model.predict([all_token_ids, segment_ids])
probas = F.softmax(y_pred[:, -1, :], dim=-1)
new_probas = torch.zeros_like(probas)
for i, ids in enumerate(output_ids):
ids = ids.cpu().numpy()
next_ids = [int(j) for j in KG.next_ones(ids)] # 下一位容许集
            # =========== 前瞻策略:若某候选分支在前缀树中未来能覆盖更多问题token(信息增益更大),则按比例增大其概率
if len(next_ids) > 1 and self.end_id in ids: # 容许集大于1且已解码出S
candidates = KG.keys(list(ids)) # 可能解码结果
weights = torch.ones_like(probas[i]) # 默认权重为1
lcs0 = lcs(ids, token_ids[i])[0] # 当前已经覆盖的token数
for c in candidates:
if len(c) > len(ids):
c = [int(j) for j in c]
w = lcs(c, token_ids[i])[0] - lcs0 # 未来还可能覆盖的token数
weights[c[len(ids)]] = max(w + 1, weights[c[len(ids)]].cpu().numpy())
probas[i] = torch.pow(probas[i], 1. / weights) # 按 p^(1/n) 来增大权重
if not next_ids: # 如果容许集为空,意味着要结束了
next_ids.append(self.end_id)
new_probas[i, next_ids] += probas[i, next_ids] # 只保留容许集概率
new_probas /= new_probas.sum(axis=1, keepdims=True) # 重新归一化
return new_probas
def generate(self, text, topk=1):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.beam_search([token_ids, segment_ids], topk=topk, min_ends=3) # 基于beam search
end_idxs = [i for i, j in enumerate(output_ids) if j == self.end_id]
subject_ids = output_ids[:end_idxs[0]]
predicate_ids = output_ids[end_idxs[0]:end_idxs[1]]
meaning_ids = output_ids[end_idxs[1]:]
return (
tokenizer.decode(subject_ids.cpu().numpy()), tokenizer.decode(predicate_ids.cpu().numpy()),
tokenizer.decode(meaning_ids.cpu().numpy()), KG[output_ids[:-1].cpu().numpy()]
)
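# ------------------------------------------------------------------
# 补充示例(非原脚本内容,仅作示意):演示上面“前瞻”策略中 p^(1/w) 的加权效果。
# 权重 w 越大(该分支未来能多覆盖的问题token越多),概率被抬得越高,
# 使前缀树解码更倾向选择与问题重合度高的候选。数值为随意构造的示例。
import torch

demo_probas = torch.tensor([0.2, 0.2, 0.6])
demo_weights = torch.tensor([1.0, 3.0, 1.0])        # 第二个候选未来可多覆盖2个token,w=2+1
demo_boosted = torch.pow(demo_probas, 1.0 / demo_weights)
print(demo_boosted)                                 # ≈ [0.2000, 0.5848, 0.6000]
print(demo_boosted / demo_boosted.sum())            # 归一化后第二个候选的相对概率明显增大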
autoqa = AutoQA(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_score = 0
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
em, f1, score = self.evaluate(valid_data, topk=3)
if score >= self.best_score:
self.best_score = score
# model.save_weights('./best_model.weights')
print(
u'[VALID] em: %.5f, f1: %.5f, score: %.5f, best_score: %.5f\n' %
(em, f1, score, self.best_score)
)
def f1sim(self, text_a, text_b):
"""计算两个文本之间的f1相似度
        说明:算出两个文本的最长公共子序列长度,然后乘2并除以两者
长度之和。推荐用pylcs算,速度较快。
"""
if not text_a and not text_b:
return 0.
else:
lcs_len = lcs(text_a, text_b)[0]
return 2. * lcs_len / (len(text_a) + len(text_b))
def evaluate(self, data, topk=1):
"""评估函数
注意:同一(S, P)对应的O可能有多个,但标注数据只保留了
一个,为了跟标注数据对齐来提高分数,这里也只保留第一个。
"""
em, f1, total = 0., 0., 0.
for d in tqdm(data, ncols=0):
a = autoqa.generate(d[0], topk=topk)
o = a[3].split('\t')[0] # 如果有多个,只保留第一个
em += float(o == d[1][3])
f1 += self.f1sim(o, d[1][3])
total += 1
em /= total
f1 /= total
return em, f1, (em + f1) / 2
def test_predict(in_file, out_file, topk=1):
"""输出测试结果到文件
结果文件可以提交到 https://www.cluebenchmarks.com 评测。
"""
    fw = open(out_file, 'w', encoding='utf-8')
    with open(in_file, encoding='utf-8') as fr:
for l in tqdm(fr):
l = json.loads(l)
s, p, m, o = autoqa.generate(l['question'], topk=topk)
if m:
s += u'(%s)' % m
l['answer'] = '%s ||| %s ||| %s' % (s, p, o.split('\t')[0])
l = json.dumps(l, ensure_ascii=False)
fw.write(l + '\n')
fw.close()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=None,
epochs=epochs,
callbacks=[evaluator]
)
model.load_weights('./best_model.weights')
em, f1, score = evaluator.evaluate(test_data, topk=1)
print(u'[TEST] topk=1, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
em, f1, score = evaluator.evaluate(test_data, topk=3)
print(u'[TEST] topk=3, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
em, f1, score = evaluator.evaluate(test_data, topk=5)
print(u'[TEST] topk=5, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
else:
model.load_weights('./best_model.weights')
# test_predict('../datasets/test.json', 'kgclue_predict.json', topk=3)
#! -*- coding: utf-8 -*-
# 用Seq2Seq做阅读理解构建
# 根据篇章先采样生成答案,然后采样生成问题
# 数据集同 https://github.com/bojone/dgcnn_for_reading_comprehension
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
# 基本参数
max_p_len = 128
max_q_len = 64
max_a_len = 16
batch_size = 24
epochs = 100
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def process_data():
if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json'):
return
# 标注数据
webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))
# 筛选数据
seps, strips = u'\n。!?!?;;,, ', u';;,, '
data = []
for d in webqa_data + sogou_data:
for p in d['passages']:
if p['answer']:
for t in text_segmentate(p['passage'], max_p_len - 2, seps, strips):
if p['answer'] in t:
data.append((t, d['question'], p['answer']))
del webqa_data
del sogou_data
# 保存一个随机序(供划分valid用)
random_order = list(range(len(data)))
np.random.seed(2022)
np.random.shuffle(random_order)
# 划分valid
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
    json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json', 'w', encoding='utf-8'), indent=4)
    json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data_list_format.json', 'w', encoding='utf-8'), indent=4)
process_data()
class MyDataset(ListDataset):
@staticmethod
def load_data(file_path):
return json.load(open(file_path))
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
"""单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for (p, q, a) in batch:
p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len + 1)
a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len)
q_token_ids, _ = tokenizer.encode(q, maxlen=max_q_len)
token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:] # 去掉answer和question的cls位
segment_ids = [0] * len(p_token_ids)
segment_ids += [1] * (len(token_ids) - len(p_token_ids))
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data_list_format.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
'''
y_pred: [btz, seq_len, hdsz]
targets: y_true, y_segment
'''
_, y_pred = outputs
y_true, y_mask = target
y_true = y_true[:, 1:]# 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class QuestionAnswerGeneration(AutoRegressiveDecoder):
"""随机生成答案,并且通过beam search来生成问题
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
_, y_pred = model.predict([token_ids, segment_ids])
return y_pred[:, -1, :]
def generate(self, passage, topk=1, topp=0.95):
token_ids, segment_ids = tokenizer.encode(passage, maxlen=max_p_len)
a_ids = self.random_sample([token_ids, segment_ids], 1, topp=topp)[0] # 基于随机采样
token_ids += list(a_ids)
segment_ids += [1] * len(a_ids)
q_ids = self.beam_search([token_ids, segment_ids], topk=topk) # 基于beam search
return (tokenizer.decode(q_ids.cpu().numpy()), tokenizer.decode(a_ids.cpu().numpy()))
qag = QuestionAnswerGeneration(start_id=None, end_id=tokenizer._token_end_id, maxlen=max_q_len, device=device)
def predict_to_file(data, filename, topk=1):
"""将预测结果输出到文件,方便评估
"""
with open(filename, 'w', encoding='utf-8') as f:
for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
q, a = qag.generate(d[0])
s = '%s\t%s\t%s\n' % (q, a, d[0])
f.write(s)
f.flush()
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
predict_to_file(valid_dataset.data[:100], 'qa.csv')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=100,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
# predict_to_file(valid_data, 'qa.csv')
#! -*- coding: utf-8 -*-
# 用MLM的方式做阅读理解任务
# 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import re
import torch.nn.functional as F
# 基本参数
max_p_len = 256
max_q_len = 64
max_a_len = 32
batch_size = 12
epochs = 10
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def process_data():
if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'):
return
# 标注数据
webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))
# 保存一个随机序(供划分valid用)
random_order = list(range(len(sogou_data)))
np.random.seed(2022)
np.random.shuffle(random_order)
# 划分valid
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data) # 将SogouQA和WebQA按2:1的比例混合
json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json', 'w', encoding='utf-8'), indent=4)
json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json', 'w', encoding='utf-8'), indent=4)
process_data()
class MyDataset(ListDataset):
@staticmethod
def load_data(file_path):
return json.load(open(file_path))
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
"""单条样本格式为
输入: [CLS][MASK][MASK][SEP]问题[SEP]篇章[SEP]
输出: 答案
"""
batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []
for D in batch:
question = D['question']
answers = [p['answer'] for p in D['passages'] if p['answer']]
passage = np.random.choice(D['passages'])['passage']
passage = re.sub(u' |、|;|,', ',', passage)
final_answer = ''
for answer in answers:
if all([a in passage[:max_p_len - 2] for a in answer.split(' ')]):
final_answer = answer.replace(' ', ',')
break
a_token_ids, _ = tokenizer.encode(final_answer, maxlen=max_a_len + 1)
q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
token_ids = [tokenizer._token_start_id]
token_ids += ([tokenizer._token_mask_id] * max_a_len)
token_ids += [tokenizer._token_end_id]
token_ids += (q_token_ids[1:] + p_token_ids[1:])
segment_ids = [0] * len(token_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_a_token_ids.append(a_token_ids[1:])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_a_token_ids = torch.tensor(sequence_padding(batch_a_token_ids, max_a_len), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_a_token_ids
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, y_true):
'''
y_pred: [btz, seq_len, hdsz]
y_true: [btz, max_a_len]
'''
_, y_pred = outputs
y_pred = y_pred[:, 1:max_a_len+1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = y_true.flatten()
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
def get_ngram_set(x, n):
"""生成ngram合集,返回结果格式是:
{(n-1)-gram: set([n-gram的第n个字集合])}
"""
result = {}
for i in range(len(x) - n + 1):
k = tuple(x[i:i + n])
if k[:-1] not in result:
result[k[:-1]] = set()
result[k[:-1]].add(k[-1])
return result
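# 补充示例(非原脚本内容):get_ngram_set 的效果。
# 对序列 [1,2,3,2,4] 取 n=2,得到 {(1,): {2}, (2,): {3, 4}, (3,): {2}},
# 即给定前一个token,集合里是篇章中可能跟随的下一个token,用于约束答案解码。
assert get_ngram_set([1, 2, 3, 2, 4], 2) == {(1,): {2}, (2,): {3, 4}, (3,): {2}}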
def gen_answer(question, passages):
"""由于是MLM模型,所以可以直接argmax解码。
"""
all_p_token_ids, token_ids, segment_ids = [], [], []
for passage in passages:
passage = re.sub(u' |、|;|,', ',', passage)
p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
all_p_token_ids.append(p_token_ids[1:])
token_ids.append([tokenizer._token_start_id])
token_ids[-1] += ([tokenizer._token_mask_id] * max_a_len)
token_ids[-1] += [tokenizer._token_end_id]
token_ids[-1] += (q_token_ids[1:] + p_token_ids[1:])
segment_ids.append([0] * len(token_ids[-1]))
token_ids = torch.tensor(sequence_padding(token_ids), device=device)
segment_ids = torch.tensor(sequence_padding(segment_ids), device=device)
logit = model.predict([token_ids, segment_ids])[-1][:, 1:max_a_len+1, :]
probas = F.softmax(logit, dim=-1)
results = {}
for t, p in zip(all_p_token_ids, probas):
a, score = tuple(), 0.
for i in range(max_a_len):
idxs = list(get_ngram_set(t, i + 1)[a])
if tokenizer._token_end_id not in idxs:
idxs.append(tokenizer._token_end_id)
# pi是将passage以外的token的概率置零
pi = torch.zeros_like(p[i])
pi[idxs] = p[i, idxs]
a = a + (pi.argmax().item(),)
score += pi.max().item()
if a[-1] == tokenizer._token_end_id:
break
score = score / (i + 1)
a = tokenizer.decode(a)
if a:
results[a] = results.get(a, []) + [score]
results = {
k: (np.array(v)**2).sum() / (sum(v) + 1)
for k, v in results.items()
}
return results
def max_in_dict(d):
if d:
return sorted(d.items(), key=lambda s: -s[1])[0][0]
def predict_to_file(data, filename):
"""将预测结果输出到文件,方便评估
"""
with open(filename, 'w', encoding='utf-8') as f:
for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
q_text = d['question']
p_texts = [p['passage'] for p in d['passages']]
a = gen_answer(q_text, p_texts)
a = max_in_dict(a)
if a:
s = u'%s\t%s\n' % (d['id'], a)
else:
s = u'%s\t\n' % (d['id'])
f.write(s)
f.flush()
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
predict_to_file(valid_dataset.data[:100], 'qa.csv')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=100,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
# predict_to_file(valid_data, 'qa.csv')
#! -*- coding: utf-8 -*-
# 用seq2seq的方式做阅读理解任务
# 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import re
# 基本参数
max_p_len = 256
max_q_len = 64
max_a_len = 32
max_qa_len = max_q_len + max_a_len
batch_size = 8
epochs = 10
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def process_data():
if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'):
return
# 标注数据
webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))
# 保存一个随机序(供划分valid用)
random_order = list(range(len(sogou_data)))
np.random.seed(2022)
np.random.shuffle(random_order)
# 划分valid
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data) # 将SogouQA和WebQA按2:1的比例混合
json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json', 'w', encoding='utf-8'), indent=4)
json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json', 'w', encoding='utf-8'), indent=4)
process_data()
class MyDataset(ListDataset):
@staticmethod
def load_data(file_path):
return json.load(open(file_path))
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
"""单条样本格式: [CLS]篇章[SEP]问题[SEP]答案[SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for D in batch:
question = D['question']
answers = [p['answer'] for p in D['passages'] if p['answer']]
passage = np.random.choice(D['passages'])['passage']
passage = re.sub(u' |、|;|,', ',', passage)
final_answer = ''
for answer in answers:
if all([a in passage[:max_p_len - 2] for a in answer.split(' ')]):
final_answer = answer.replace(' ', ',')
break
qa_token_ids, qa_segment_ids = tokenizer.encode(question, final_answer, maxlen=max_qa_len + 1)
p_token_ids, p_segment_ids = tokenizer.encode(passage, maxlen=max_p_len + 1)
token_ids = p_token_ids + qa_token_ids[1:]
segment_ids = p_segment_ids + qa_segment_ids[1:]
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
'''
y_pred: [btz, seq_len, hdsz]
targets: y_true, y_segment
'''
_, y_pred = outputs
y_true, y_mask = target
y_true = y_true[:, 1:]# 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class ReadingComprehension(AutoRegressiveDecoder):
"""beam search解码来生成答案
passages为多篇章组成的list,从多篇文章中自动决策出最优的答案,
如果没答案,则返回空字符串。
mode是extractive时,按照抽取式执行,即答案必须是原篇章的一个片段。
"""
def __init__(self, mode='extractive', **kwargs):
super(ReadingComprehension, self).__init__(**kwargs)
self.mode = mode
def get_ngram_set(self, x, n):
"""生成ngram合集,返回结果格式是:
{(n-1)-gram: set([n-gram的第n个字集合])}
"""
result = {}
for i in range(len(x) - n + 1):
k = tuple(x[i:i + n])
if k[:-1] not in result:
result[k[:-1]] = set()
result[k[:-1]].add(k[-1])
return result
@AutoRegressiveDecoder.wraps(default_rtype='probas', use_states=True)
def predict(self, inputs, output_ids, states):
inputs = [i for i in inputs if i[0, 0].item() > -1] # 过滤掉无答案篇章
topk = len(inputs[0])
all_token_ids, all_segment_ids = [], []
for token_ids in inputs: # inputs里每个元素都代表一个篇章
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.zeros_like(token_ids)
if states > 0:
segment_ids[:, -output_ids.shape[1]:] = 1
all_token_ids.extend(token_ids)
all_segment_ids.extend(segment_ids)
padded_all_token_ids = sequence_padding(all_token_ids)
padded_all_segment_ids = sequence_padding(all_segment_ids)
_, logits = model.predict([padded_all_token_ids, padded_all_segment_ids])
probas = nn.Softmax(dim=-1)(logits)
# 这里改成用torch.gather来做了
# probas = [probas[i, len(ids) - 1] for i, ids in enumerate(all_token_ids)]
# probas = torch.stack(probas).reshape((len(inputs), topk, -1))
index_ = torch.tensor([[len(i)-1] for i in all_token_ids], device=probas.device).view(-1, 1, 1).expand(-1, 1, probas.shape[-1])
probas = torch.gather(probas, dim=1, index=index_).reshape((len(inputs), topk, -1))
if states == 0:
# 这一步主要是排除没有答案的篇章
# 如果一开始最大值就为end_id,那说明该篇章没有答案
argmax = probas[:, 0].argmax(dim=1)
available_idxs = torch.where(argmax != self.end_id)[0]
if len(available_idxs) == 0:
scores = torch.zeros_like(probas[0])
scores[:, self.end_id] = 1
return scores, states + 1
else:
for i in torch.where(argmax == self.end_id)[0]:
inputs[i][:, 0] = -1 # 无答案篇章首位标记为-1
probas = probas[available_idxs]
inputs = [i for i in inputs if i[0, 0] > -1] # 过滤掉无答案篇章
if self.mode == 'extractive':
# 如果是抽取式,那么答案必须是篇章的一个片段
# 那么将非篇章片段的概率值全部置0
new_probas = torch.zeros_like(probas)
ngrams = {}
for token_ids in inputs:
token_ids = token_ids[0]
sep_idx = torch.where(token_ids == tokenizer._token_end_id)[0][0]
p_token_ids = token_ids[1:sep_idx]
                for k, v in self.get_ngram_set(p_token_ids.cpu().numpy(), states + 1).items():  # 这里要先转成.cpu().numpy(),否则后面ngrams会get不到
ngrams[k] = ngrams.get(k, set()) | v
for i, ids in enumerate(output_ids):
available_idxs = ngrams.get(tuple(ids.cpu().numpy()), set())
available_idxs.add(tokenizer._token_end_id)
available_idxs = list(available_idxs)
new_probas[:, i, available_idxs] = probas[:, i, available_idxs]
probas = new_probas
return (probas**2).sum(0) / (probas.sum(0) + 1), states + 1 # 某种平均投票方式
def answer(self, question, passages, topk=1):
token_ids = []
for passage in passages:
passage = re.sub(u' |、|;|,', ',', passage)
p_token_ids = tokenizer.encode(passage, maxlen=max_p_len)[0]
q_token_ids = tokenizer.encode(question, maxlen=max_q_len + 1)[0]
token_ids.append(p_token_ids + q_token_ids[1:])
output_ids = self.beam_search(token_ids, topk=topk, states=0) # 基于beam search
return tokenizer.decode(output_ids.cpu().numpy())
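# ------------------------------------------------------------------
# 补充示例(非原脚本内容,仅作示意):演示上面 (p**2).sum(0)/(p.sum(0)+1) 的投票效果。
# 少数篇章给出高置信度的token,会比多个篇章都给出低置信度的token得分更高,
# 相当于一种抑制"广撒网低分"候选的加权平均。数值为随意构造的示例。
import torch

demo_p = torch.tensor([[0.9, 0.3],    # 篇章1 对两个候选token的概率
                       [0.0, 0.3],    # 篇章2
                       [0.0, 0.3]])   # 篇章3
print((demo_p ** 2).sum(0) / (demo_p.sum(0) + 1))   # tensor([0.4263, 0.1421]):单篇0.9胜过三篇0.3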
reader = ReadingComprehension(
start_id=None,
end_id=tokenizer._token_end_id,
maxlen=max_a_len,
mode='extractive',
device=device
)
def predict_to_file(data, filename, topk=1):
"""将预测结果输出到文件,方便评估
"""
with open(filename, 'w', encoding='utf-8') as f:
for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
q_text = d['question']
p_texts = [p['passage'] for p in d['passages']]
a = reader.answer(q_text, p_texts, topk)
if a:
s = u'%s\t%s\n' % (d['id'], a)
else:
s = u'%s\t\n' % (d['id'])
f.write(s)
f.flush()
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
predict_to_file(valid_dataset.data[:100], 'qa.csv')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=100,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
predict_to_file(valid_dataset.data, 'qa.csv')
#! -*- coding: utf-8 -*-
# 用Seq2Seq做小学数学应用题
# 数据集为ape210k:https://github.com/Chenny0808/ape210k
# 介绍链接:https://kexue.fm/archives/7809
from __future__ import division
import json, re
from tqdm import tqdm
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from torch import nn, optim
import torch
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.snippets import AutoRegressiveDecoder
from sympy import Integer
import warnings
warnings.filterwarnings("ignore")
# 基本参数
maxlen = 192
batch_size = 16
epochs = 100
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def is_equal(a, b):
"""比较两个结果是否相等
"""
a = round(float(a), 6)
b = round(float(b), 6)
return a == b
def remove_bucket(equation):
"""去掉冗余的括号
"""
l_buckets, buckets = [], []
for i, c in enumerate(equation):
if c == '(':
l_buckets.append(i)
elif c == ')':
buckets.append((l_buckets.pop(), i))
eval_equation = eval(equation)
for l, r in buckets:
new_equation = '%s %s %s' % (equation[:l], equation[l + 1:r], equation[r + 1:])
try:
if is_equal(eval(new_equation.replace(' ', '')), eval_equation):
equation = new_equation
except:
pass
return equation.replace(' ', '')
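# 补充示例(非原脚本内容):remove_bucket 只去掉不改变计算结果的冗余括号。
assert remove_bucket('(1+2)*3') == '(1+2)*3'   # 括号必要,保留
assert remove_bucket('1+(2*3)') == '1+2*3'     # 括号冗余,去掉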
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取训练数据,并做一些标准化,保证equation是可以eval的
参考:https://kexue.fm/archives/7809
"""
D = []
for l in open(filename, 'r', encoding='utf-8'):
l = json.loads(l)
question, equation, answer = l['original_text'], l['equation'], l['ans']
# 处理带分数
            question = re.sub(r'(\d+)\((\d+/\d+)\)', '(\\1+\\2)', question)
            equation = re.sub(r'(\d+)\((\d+/\d+)\)', '(\\1+\\2)', equation)
            answer = re.sub(r'(\d+)\((\d+/\d+)\)', '(\\1+\\2)', answer)
            equation = re.sub(r'(\d+)\(', '\\1+(', equation)
            answer = re.sub(r'(\d+)\(', '\\1+(', answer)
            # 分数去括号
            question = re.sub(r'\((\d+/\d+)\)', '\\1', question)
            # 处理百分数
            equation = re.sub(r'([\.\d]+)%', '(\\1/100)', equation)
            answer = re.sub(r'([\.\d]+)%', '(\\1/100)', answer)
# 冒号转除号、剩余百分号处理
equation = equation.replace(':', '/').replace('%', '/100')
answer = answer.replace(':', '/').replace('%', '/100')
if equation[:2] == 'x=':
equation = equation[2:]
try:
if is_equal(eval(equation), eval(answer)):
D.append((question, remove_bucket(equation), answer))
except:
continue
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
for question, equation, answer in batch:
token_ids, segment_ids = tokenizer.encode(question, equation, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/train.ape.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/valid.ape.json')
# valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/test.ape.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
'''
y_pred: [btz, seq_len, vocab_size]
targets: y_true, y_segment
unilm式样,需要手动把非seq2seq部分mask掉
'''
_, y_pred = outputs
y_true, y_mask = target
y_true = y_true[:, 1:]# 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class AutoSolve(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
_, y_pred = model.predict([token_ids, segment_ids])
return y_pred[:, -1, :]
def generate(self, text, topk=1):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.beam_search([token_ids, segment_ids], topk=topk) # 基于beam search
return tokenizer.decode(output_ids.cpu().numpy()).replace(' ', '')
autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64, device=device)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_acc = 0.
def on_epoch_end(self, steps, epoch, logs=None):
metrics = self.evaluate(valid_dataset.data[:200]) # 评测模型
if metrics['acc'] >= self.best_acc:
self.best_acc = metrics['acc']
# model.save_weights('./best_model_math.pt') # 保存模型
metrics['best_acc'] = self.best_acc
print('valid_data:', metrics)
print()
def evaluate(self, data, topk=1):
total, right = 0.0, 0.0
for question, equation, answer in tqdm(data, desc='Evaluate'):
total += 1
pred_equation = autosolve.generate(question, topk)
try:
right += int(is_equal(eval(pred_equation), eval(answer)))
except:
pass
return {'acc': right / total}
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=500, epochs=epochs, callbacks=[evaluator])
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# bert做Seq2Seq任务,采用UNILM方案
# 介绍链接:https://kexue.fm/archives/6933
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import glob
# 基本参数
maxlen = 256
batch_size = 16
epochs = 10000
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
"""单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for txt in batch:
text = open(txt, encoding='utf-8').read()
text = text.split('\n')
if len(text) > 1:
title = text[0]
content = '\n'.join(text[1:])
token_ids, segment_ids = tokenizer.encode(content, title, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
train_dataloader = DataLoader(ListDataset(glob.glob('F:/Projects/data/corpus/sentence_classification/THUCNews/*/*.txt')),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
'''
y_pred: [btz, seq_len, vocab_size]
targets: y_true, y_segment
unilm式样,需要手动把非seq2seq部分mask掉
'''
_, y_pred = outputs
y_true, y_mask = target
y_true = y_true[:, 1:]# 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
_, y_pred = model.predict([token_ids, segment_ids])
return y_pred[:, -1, :]
def generate(self, text, topk=1, topp=0.95):
max_c_len = maxlen - self.maxlen
token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
output_ids = self.beam_search([token_ids, segment_ids], topk=topk) # 基于beam search
return tokenizer.decode(output_ids.cpu().numpy())
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)
def just_show():
s1 = u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。'
s2 = u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show()
if __name__ == '__main__':
just_show()
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=100,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# bert做Seq2Seq任务,采用BART方案
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import json
from rouge import Rouge
# 基本参数
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
def collate_fn(batch):
"""单条样本格式:content:[CLS]文章[SEP] tgt: [CLS]标题[SEP]
"""
    batch_content_ids, batch_title_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_title_ids.append([tokenizer._token_end_id] + token_ids)  # 预训练时候是用[SEP]开头的
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_title_ids = torch.tensor(sequence_padding(batch_title_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_title_ids[:, :-1]]], batch_title_ids[:, 1:].flatten()
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')
model = build_transformer_model(config_path, checkpoint_path, model='bart', keep_tokens=keep_tokens, segment_vocab_size=0).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, y_true):
_, _, y_pred = outputs
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1.5e-5))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :] # 保留最后一位
def generate(self, text, topk=1):
token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
token_ids = torch.tensor([token_ids], device=device)
encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
return tokenizer.decode(output_ids.cpu().numpy())
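# Decoding flow: the article is encoded once with model.encoder, then beam_search repeatedly
# calls predict(), which feeds the partial output_ids together with the cached encoder outputs
# to model.decoder and keeps only the logits of the last position.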
autotitle = AutoTitle(start_id=tokenizer._token_end_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the validation set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
print('test_data:', metrics_test)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
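# References and predictions are split into single characters (' '.join(...)), so the ROUGE and
# BLEU values are character-level metrics, the usual convention for Chinese title generation.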
if __name__ == '__main__':
evaluator = Evaluator()
just_show()
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# Fine-tune the multilingual T5 (mT5) for a Seq2Seq task
# Reference: https://kexue.fm/archives/7867
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# Adds the evaluation metrics bleu, rouge-1, rouge-2 and rouge-l
# Key mT5 features: gated-gelu, an independent final dense layer in the decoder, rmsnorm
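# Rough sketch of the gated-gelu feed-forward that mT5 uses (illustrative pseudo-code with
# made-up weight names, not the bert4torch implementation):
#     h = gelu(x @ W_gate) * (x @ W_in)   # gating branch multiplied with a linear branch
#     y = h @ W_out                       # surrounded by RMSNorm instead of LayerNorm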
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import SpTokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
token_pad_ids = -100
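# -100 is used as the padding value in sequence_padding, as the pad id the model uses to build
# its attention mask (token_pad_ids below), and as ignore_index in the loss, so padded
# positions are masked out everywhere.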
# mT5 config
config_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_torch_base]/pytorch_model.bin'
# The two files below are taken from bert4keras; project link: https://github.com/bojone/t5_in_bert4keras
spm_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_bert4keras]/sentencepiece_cn.model'
keep_tokens_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_bert4keras]/sentencepiece_cn_keep_tokens.json'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>')
keep_tokens = json.load(open(keep_tokens_path))
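# keep_tokens maps the trimmed Chinese sentencepiece vocab onto the original mT5 vocab ids,
# so only the corresponding rows of the embedding and output layers are loaded.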
def collate_fn(batch):
    """Single-sample format: content: article token ids  tgt: title token ids prefixed with id 0
    """
    batch_content_ids, batch_title_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_title_ids.append([0] + token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids, value=token_pad_ids), dtype=torch.long, device=device)
    batch_title_ids = torch.tensor(sequence_padding(batch_title_ids, value=token_pad_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_title_ids[:, :-1]]], batch_title_ids[:, 1:].flatten()
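# Decoder inputs are the title ids prefixed with id 0 (T5's decoder start / pad token) and cut
# by one position; the labels are the same ids shifted left by one (teacher forcing).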
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')
model = build_transformer_model(
config_path,
checkpoint_path,
model='mt5.1.1',
segment_vocab_size=0,
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocab
    token_pad_ids=token_pad_ids,  # alternatively, set custom_attention_mask and pass attention_mask explicitly
).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, y_true):
_, _, y_pred = outputs
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=token_pad_ids), optimizer=optim.Adam(model.parameters(), 1e-4))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
# inputs中包含了[decoder_ids, encoder_hidden_state, encoder_attention_mask]
return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :] # 保留最后一位
def generate(self, text, topk=1):
token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
token_ids = torch.tensor([token_ids], device=device)
encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])
autotitle = AutoTitle(start_id=0, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)
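# start_id=0 matches the decoder start token used in collate_fn; end_id is the sentencepiece
# </s> token, so beam search stops as soon as </s> is generated.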
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the validation set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
print('test_data:', metrics_test)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
if __name__ == '__main__':
evaluator = Evaluator()
    print(u'生成标题:', autotitle.generate(u'中国的首都是extra0京'))  # consistent with the huggingface result
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# Fine-tune T5 PEGASUS for a Seq2Seq task; a BertTokenizer is used
# Reference: https://kexue.fm/archives/8209
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_t5_pegasus.py
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import jieba
jieba.initialize()
# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
# T5 PEGASUS config
pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/'
config_path = pretrain_model + 'config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
dict_path = pretrain_model + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
tokenizer = Tokenizer(
dict_path,
do_lower_case=True,
pre_tokenize=lambda s: jieba.cut(s, HMM=False)
)
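# T5 PEGASUS ships a BERT-style vocab that contains word-level entries, so the text is first
# segmented with jieba and the resulting words are then matched against vocab.txt.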
def collate_fn(batch):
    """Single-sample format: content: [CLS]article[SEP]  tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_title_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_title_ids.append(token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_title_ids = torch.tensor(sequence_padding(batch_title_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_title_ids[:, :-1]]], batch_title_ids[:, 1:].flatten()
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')
model = build_transformer_model(
config_path,
checkpoint_path,
model='mt5.1.1',
segment_vocab_size=0,
).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, y_true):
_, _, y_pred = outputs
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
# inputs中包含了[decoder_ids, encoder_hidden_state, encoder_attention_mask]
return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :] # 保留最后一位
def generate(self, text, topk=1):
token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
token_ids = torch.tensor([token_ids], device=device)
encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])
autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the validation set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
print('test_data:', metrics_test)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
if __name__ == '__main__':
evaluator = Evaluator()
print(u'生成标题:', autotitle.generate(u'今天天气不错啊'))
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# Fine-tune the UER T5 for a Seq2Seq task
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
# T5 config
config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
# Load and simplify the vocab, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
    """Single-sample format: content: [CLS]article[SEP]  tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_title_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_title_ids.append(token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_title_ids = torch.tensor(sequence_padding(batch_title_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_title_ids[:, :-1]]], batch_title_ids[:, 1:].flatten()
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')
model = build_transformer_model(
config_path,
checkpoint_path,
model='t5.1.0',
segment_vocab_size=0,
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocab
).to(device)
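# model='t5.1.0' selects the original T5 v1.0 layout (relu feed-forward, tied input/output
# embeddings), which is what the UER t5-base-chinese-cluecorpussmall checkpoint uses, in
# contrast to the mt5.1.1 layout in the mT5 / T5 PEGASUS scripts above.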
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, y_true):
_, _, y_pred = outputs
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :] # 保留最后一位
def generate(self, text, topk=1):
token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
token_ids = torch.tensor([token_ids], device=device)
encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
return tokenizer.decode(output_ids.cpu().numpy())
autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the validation set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
print('test_data:', metrics_test)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
if __name__ == '__main__':
evaluator = Evaluator()
print(u'生成标题:', autotitle.generate('中国的首都是extra0京'))
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# Seq2Seq with BERT using the UniLM scheme
# Reference: https://kexue.fm/archives/6933
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# Adds the evaluation metrics bleu, rouge-1, rouge-2 and rouge-l
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Basic parameters
maxlen = 256
batch_size = 16
epochs = 50
steps_per_epoch = None
# BERT config
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
# Load and simplify the vocab, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
"""单条样本格式:[CLS]文章[SEP]标题[SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for title, content in batch:
token_ids, segment_ids = tokenizer.encode(content, title, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
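# UniLM packs source and target into a single sequence [CLS]article[SEP]title[SEP]; the
# segment_ids are 0 for the article and 1 for the title, and the unilm attention mask lets
# title tokens attend to the article plus earlier title tokens only.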
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocab
).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, vocab_size]
        target: (y_true, y_segment)
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token ids
        y_mask = y_mask[:, 1:]  # segment_ids, which exactly mark the positions to predict
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true*y_mask).flatten()
        return super().forward(y_pred, y_true)
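# Loss masking: multiplying y_true by y_mask zeroes out every article position, and with
# ignore_index=0 those positions (and genuine [PAD] tokens, id 0) are skipped, so the loss is
# computed only on the title tokens.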
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
_, y_pred = model.predict([token_ids, segment_ids])
return y_pred[:, -1, :]
def generate(self, text, topk=1, topp=0.95):
max_c_len = maxlen - self.maxlen
token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search decoding
return tokenizer.decode(output_ids.cpu().numpy())
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)
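# start_id=None because generation simply continues the [CLS]article[SEP] prefix; each new token
# is appended with segment id 1 and decoding stops at [SEP] (end_id) or after 32 tokens.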
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the validation set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
print('test_data:', metrics_test)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
if __name__ == '__main__':
just_show()
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')