"megatron/core/pipeline_parallel/schedules.py" did not exist on "b93bef00d4fdcd8adc9a276e9834c7714aa559c2"
Commit 0e29b9b7 authored by xuxo's avatar xuxo
Browse files

yidong infer init

parents
Pipeline #3252 failed with stages
in 0 seconds
#! -*- coding:utf-8 -*-
# Sentiment classification example; very long texts use hierarchical_position (hierarchical decomposition of position embeddings), see spaces.ac.cn/archives/7947
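# A rough sketch of the idea (an assumption based on the linked post, not necessarily this
# library's exact implementation): a pretrained table of n positions is extended to n*n
# positions by decomposing position i as
#   p_i = alpha * base[i // n] + (1 - alpha) * base[i % n], with alpha around 0.4,
# so a 512-position BERT table can cover sequences of 1024+ tokens without retraining.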
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
maxlen = 1024
batch_size = 3
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
# Specify hierarchical_position and max_position here to extend the original position embeddings to the longer max_position
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, hierarchical_position=True, max_position=1024, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# Define the loss and optimizer; custom ones are supported
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification example, loading NEZHA weights
# valid_acc: 95.07, test_acc: 94.72
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import os
import numpy as np
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
# Specify model='nezha' and the corresponding checkpoint path
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='nezha', with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
hidden_states, pooling = self.bert([token_ids, segment_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# Define the loss and optimizer; custom ones are supported
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification example with RoPE relative position encoding
# Official repo: https://github.com/ZhuiyiTechnology/roformer
# PyTorch reference repo: https://github.com/JunnYu/RoFormer_pytorch
# valid_acc: 94.85, test_acc: 94.42
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import random
import os
import numpy as np
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
# Specify the model type and the corresponding checkpoint path
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer', with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
hidden_states, pooling = self.bert([token_ids, segment_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# Define the loss and optimizer; custom ones are supported
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification example with RoPE relative position encoding
# Official repo: https://github.com/ZhuiyiTechnology/roformer-v2
# PyTorch reference repo: https://github.com/JunnYu/RoFormer_pytorch
# valid_acc: 95.78, test_acc: 96.09
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
# Specify the model type and the corresponding checkpoint path
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer_v2')
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
last_hidden_state = self.bert([token_ids, segment_ids])
output = self.dropout(last_hidden_state[:, 0, :])
output = self.dense(output)
return output
model = Model().to(device)
# Define the loss and optimizer; custom ones are supported
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification task: XLNet
# The tokenizer in the transformers package pads on the left
# Either the transformers tokenizer or SpTokenizer can be used here; note that when taking the last position, take the last non-padding position
# valid_acc: 95.00, test_acc: 94.24
from bert4torch.tokenizers import SpTokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
pretrain_model = 'F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base/'
config_path = pretrain_model + 'bert4torch_config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
spm_path = pretrain_model + 'spiece.model'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = SpTokenizer(spm_path, token_start=None, token_end='<cls>')
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_labels.append([label])
# Use the tokenizer's pad_id for padding
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, value=tokenizer._token_pad_id), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='xlnet',
token_pad_ids=tokenizer._token_pad_id, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(768, 2)
def forward(self, token_ids):
last_hidden_state = self.bert([token_ids])
# Take the hidden state at the last (<cls>) position
last_token_idx = token_ids.not_equal(tokenizer._token_pad_id).sum(dim=1) - 1
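# expand the index to [btz, 1, hidden_size] so torch.gather can select that position's hidden state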
last_token_idx = last_token_idx[:, None, None].expand(last_hidden_state.shape[0], 1, last_hidden_state.shape[-1])
pooling = torch.gather(last_hidden_state, dim=1, index=last_token_idx).squeeze(1)
output = self.dropout(pooling)
output = self.dense(output)
return output
model = Model().to(device)
# Define the loss and optimizer; custom ones are supported
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
test_acc = evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
# Two-stage recall + ranking model
An FAQ solution for the finance domain
## Modeling approach
1. Stage 1: train a supervised semantic-similarity model with MultiNegativeRankingLoss
2. Use the stage-1 model to retrieve, by vector similarity, the K most similar standard questions q_std_pred for every similar question q_sim; candidates equal to the true q_std are positives, the rest are hard negatives
3. Stage 2: train a supervised semantic-similarity model with ContrastiveLoss
4. Prediction: a query first retrieves the top-K standard questions q_std with the stage-1 model, then the stage-2 model picks the most likely one among those K (see the sketch below)
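A minimal sketch of this two-stage prediction flow, assuming stage-1/stage-2 models that expose an `encode` method like the training scripts in this folder (all names here are illustrative, not the repo's exact API):

```python
import numpy as np

def two_stage_predict(query, model1, model2, q_std_list, q_std_vecs_1, q_std_vecs_2, K=10):
    """Recall top-K standard questions with the stage-1 model, then rerank them with stage-2."""
    # Stage 1: cosine similarity between the query and every standard question
    q1 = model1.encode([query])[0].numpy()
    sims1 = q_std_vecs_1 @ q1 / (np.linalg.norm(q_std_vecs_1, axis=1) * np.linalg.norm(q1))
    topk = np.argsort(-sims1)[:K]
    # Stage 2: rerank only the K recalled candidates with the stage-2 embeddings
    q2 = model2.encode([query])[0].numpy()
    cand = q_std_vecs_2[topk]
    sims2 = cand @ q2 / (np.linalg.norm(cand, axis=1) * np.linalg.norm(q2))
    return q_std_list[topk[int(np.argmax(sims2))]]
```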
## Pros and cons
- Stage-1 training automatically constructs hard samples for the stage-2 model, similar in spirit to boosting, which further improves accuracy
## Files
| File | Description |
| ---- | ---- |
| task_sentence_embedding_FinanceFAQ_step1_0.ipynb | Stage-1 data generation |
| task_sentence_embedding_FinanceFAQ_step1_1.py | Stage-1 model training |
| task_sentence_embedding_FinanceFAQ_step2_0.ipynb | Stage-2 data generation |
| task_sentence_embedding_FinanceFAQ_step2_1.ipynb | Stage-2 model training |
| task_sentence_embedding_FinanceFAQ_step3_predict.ipynb | Model evaluation |
| task_sentence_embedding_FinanceFAQ_step3_inference.ipynb | Single-sample inference |
## Metrics
- Evaluation set: all (standard question, similar question) pairs (in-sample)
- Metric: recall (the fraction of cases where the correct standard question appears in the retrieved top-K)
| Stage | Top1 | Top3 | Top5 | Top10 |
|----|----|----|----|----|
| Stage 1, raw mode | 91.32 | 97.94 | 98.91 | 99.57 |
| Stage 1, random mode | 88.19 | 95.93 | 97.56 | 98.82 |
| Stage 1, mul_ce mode | 90.32 | 97.51 | 98.67 | 99.44 |
| Stage 2 | 98.00 | 99.47 | 99.79 | 100 |
| Stage 1 (raw) + Stage 2 end-to-end | 97.54 | 99.00 | 99.33 | 99.50 |
## requirements
transformers==4.15.0
# Model file paths
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
data_dir = 'F:/Projects/data/corpus/qa/FinanceFAQ'
q_std_file = f'{data_dir}/q_std_file.tsv' # standard questions
q_corpus_file = f'{data_dir}/q_corpus_file.tsv' # all corpus data
q_sim_file = f'{data_dir}/q_sim_file.tsv'
# Stage-1 training
fst_train_file = f'{data_dir}/fst_train.tsv'
fst_dev_file = f'{data_dir}/fst_dev.tsv'
ir_path = f'{data_dir}/fst_ir_corpus.tsv'
fst_q_std_vectors_file = f'{data_dir}/fst_q_std_vectors_file.npy'
fst_q_corpus_vectors_file = f'{data_dir}/fst_q_corpus_vectors_file.npy'
fst_std_data_results = f'{data_dir}/fst_std_data_results.tsv'
fst_eval_path_list = [f'{data_dir}/fst_eval.tsv']
# Stage 2
sec_train_file = f'{data_dir}/sec_train_file.tsv'
sec_dev_file = f'{data_dir}/sec_dev_file.tsv'
sec_test_file = f'{data_dir}/sec_test_file.tsv'
sec_q_std_vectors_file = f'{data_dir}/sec_q_std_vectors_file.npy'
sec_q_corpus_vectors_file = f'{data_dir}/sec_q_corpus_vectors_file.npy'
sec_eval_path_list = []
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"source": [
"from config import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"q_std_map = pd.read_csv('F:/Projects/data/corpus/qa/FinanceFAQ/input/q_std.tsv', sep='\\t', encoding='utf-8')['0'].to_dict()\n",
"query_pair = pd.read_csv('F:/Projects/data/corpus/qa/FinanceFAQ/input/query_pair_0.tsv', sep='\\t', encoding='utf-8')\n",
"query_pair['q_std'] = query_pair['q_std'].map(q_std_map)\n",
"query_pair.to_csv(fst_train_file, sep='\\t', encoding='utf-8', index=False)\n",
"query_pair.iloc[5:9]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>q_std</th>\n",
" <th>q_sim</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>现金利能否直接购买股票</td>\n",
" <td>就是说现金利是可以卖股票的对吗</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>损益表的介绍</td>\n",
" <td>损益表是啥</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>#股票名称#季度报告</td>\n",
" <td>详细说下600338第一季报吧</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>未成交的介绍</td>\n",
" <td>需要知道未成交是什么</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" q_std q_sim\n",
"5 现金利能否直接购买股票 就是说现金利是可以卖股票的对吗\n",
"6 损益表的介绍 损益表是啥\n",
"7 #股票名称#季度报告 详细说下600338第一季报吧\n",
"8 未成交的介绍 需要知道未成交是什么"
]
},
"metadata": {},
"execution_count": 9
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"source": [
"query_pair.q_sim.str.len().describe()"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"count 50000.000000\n",
"mean 18.549940\n",
"std 7.961594\n",
"min 1.000000\n",
"25% 13.000000\n",
"50% 18.000000\n",
"75% 23.000000\n",
"max 108.000000\n",
"Name: q_sim, dtype: float64"
]
},
"metadata": {},
"execution_count": 10
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 11,
"source": [
"# 为每个q_std随机选择一个q_sim作为dev集\n",
"q_std_list = query_pair['q_std'].unique().tolist()\n",
"query_pair['test_rnd'] = query_pair.q_std.apply(lambda x: np.random.rand())\n",
"query_pair['nrank_test'] = query_pair.groupby('q_std')['test_rnd'].rank(ascending=0, method='first')\n",
"dev_query_pair = query_pair[query_pair.nrank_test<=1][['q_std', 'q_sim']]\n",
"dev_query_pair.head(5)"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>q_std</th>\n",
" <th>q_sim</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>回档的介绍</td>\n",
" <td>回档是什么东西</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>合约未了结情况下,卖出担保品时,提示零股不支持交易的解决方式</td>\n",
" <td>合约未了结情况下,卖出担保品时,解决提醒零股不能买卖的方式可能是什么</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>任职基金数大于#数字实体#的基金经理有哪些</td>\n",
" <td>想知道任职基金数大于50的基金经理有哪些</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>市销率大于#数字实体#的行业有哪些</td>\n",
" <td>我来咨询看看市销率大于100行业都有啥</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>竞价涨幅不小于#数字实体#的#地域板块#股票有哪些</td>\n",
" <td>给我发下都有啥内蒙板块股票是竞价涨幅不少于50</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" q_std q_sim\n",
"1 回档的介绍 回档是什么东西\n",
"11 合约未了结情况下,卖出担保品时,提示零股不支持交易的解决方式 合约未了结情况下,卖出担保品时,解决提醒零股不能买卖的方式可能是什么\n",
"15 任职基金数大于#数字实体#的基金经理有哪些 想知道任职基金数大于50的基金经理有哪些\n",
"16 市销率大于#数字实体#的行业有哪些 我来咨询看看市销率大于100行业都有啥\n",
"31 竞价涨幅不小于#数字实体#的#地域板块#股票有哪些 给我发下都有啥内蒙板块股票是竞价涨幅不少于50"
]
},
"metadata": {},
"execution_count": 11
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 12,
"source": [
"# 为所有的query配一个qid\n",
"q_std_dev = dev_query_pair.q_std.unique().tolist()\n",
"q_sim_dev = dev_query_pair.q_sim.unique().tolist()\n",
"q_qid = q_std_dev + q_sim_dev\n",
"q_qid = list(set(q_qid))\n",
"q_qid_dict = {i+1:q_qid[i] for i in range(0, len(q_qid))} # {id: query}\n",
"q_qid_dict_inv = {v: k for k, v in q_qid_dict.items()} # {query: id}\n",
"\n",
"# 建立ir_corpus: [q_id, q_std]的映射\n",
"ir_corpus = {q_qid_dict_inv[v]: v for v in q_std_list if v not in q_sim_dev}\n",
"ir_corpus_df = pd.DataFrame(list(ir_corpus.items()), columns=['qid', 'question']).sort_values('qid').reset_index(drop=True)\n",
"ir_corpus_df.to_csv(ir_path, sep='\\t', index=False)\n",
"\n",
"# 保存dev\n",
"dev_query_pair['qid'] = dev_query_pair.q_sim.map(q_qid_dict_inv)\n",
"dev_query_pair['duplicate_qids'] = dev_query_pair.q_std.map(q_qid_dict_inv)\n",
"dev_query_pair.duplicate_qids = dev_query_pair.duplicate_qids.astype('str')\n",
"dev_query_pair = dev_query_pair.groupby(['q_sim', 'qid']).apply(lambda v: ','.join(v['duplicate_qids'])).reset_index(name='duplicate_qids')[['qid', 'q_sim', 'duplicate_qids']]\n",
"dev_query_pair.to_csv(fst_dev_file, sep='\\t', index=False)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print('读取数据集并分别保存标问、相似问、所有语料: ', fst_train_file)\n",
"std_data = pd.read_csv(fst_train_file, sep=\"\\t\")\n",
"q_std_list = std_data.q_std.unique().tolist() # 标准问list\n",
"q_sim_list = std_data.q_sim.unique().tolist() # 相似问list\n",
"q_corpus = list(set(q_std_list + q_sim_list))\n",
"\n",
"q_std_df = pd.DataFrame(q_std_list, columns=['q'])\n",
"q_corpus_df = pd.DataFrame(q_corpus, columns=['q'])\n",
"q_sim_df = pd.DataFrame(q_sim_list, columns=['q'])\n",
"\n",
"q_std_df.to_csv(q_std_file, index=None, header=False, sep=\"\\t\")\n",
"q_corpus_df.to_csv(q_corpus_file, index=None, header=False, sep=\"\\t\")\n",
"q_sim_df.to_csv(q_sim_file, index=None, header=False, sep=\"\\t\")\n",
"\n",
"print('q_std_list:——>', len(q_std_list), 'q_sim_list:——>', len(q_sim_list), 'q_corpus:——>', len(q_corpus))"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
#! -*- coding:utf-8 -*-
# loss: MultiNegativeRankingLoss; as in SimCSE, the other samples in the batch serve as negatives
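# A minimal sketch of the objective (an assumed equivalent formulation, not necessarily this
# file's exact code): with paired embeddings a_i (q_sim) and b_j (q_std), compute
# scores = cos_sim(a, b) * scale, a [btz, btz] matrix, and apply cross entropy against
# labels = arange(btz), so each a_i must score highest with its own b_i while every other
# b_j in the batch acts as a negative.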
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sentence_transformers import evaluation
from config import config_path, checkpoint_path, dict_path, fst_train_file, fst_dev_file, ir_path
import numpy as np
import pandas as pd
import random
import os
# Fix the random seed
seed_everything(42)
maxlen = 64
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# raw: the original version
# random: sample within each standard-question group; different groups act as negatives for each other
# mul_ce: a modified version of raw in which cross-group pairs can also be positives (when their standard questions match)
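# Label construction (as implemented in the collate_fn below): in 'raw' mode the label of sample i
# is simply the index i (standard in-batch cross entropy); in 'mul_ce' mode the label is a 0/1
# matrix that starts as the identity and additionally marks pairs sharing the same q_std as
# positives, so duplicated standard questions in a batch are not treated as negatives.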
choice = 'mul_ce'
print(f'using {choice} mode in step1 model'.center(60, '-'))
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
if choice in {'raw', 'mul_ce'}:
# Original mode: the same standard question may appear more than once in a batch
def collate_fn(batch):
if choice == 'raw':
labels = torch.arange(len(batch), device=device)
else:
labels = torch.eye(len(batch), dtype=torch.long, device=device)
# Locate identical elements (samples sharing the same q_std)
for i, (q_std1, _) in enumerate(batch):
for j, (q_std2, _) in enumerate(batch[i+1:], start=i+1):
if q_std1 == q_std2:
labels[i, j] = 1
labels[j, i] = 1
texts_list = [[] for _ in range(2)]
for texts in batch:
for i, text in enumerate(texts):
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
texts_list[i].append(token_ids)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
return texts_list, labels
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
for row, l in enumerate(f):
if row == 0: # skip the header row
continue
q_std, q_sim = l.strip().split('\t')
D.append((q_std.replace(' ', ''), q_sim.replace(' ', '')))
return D
elif choice == 'random':
# Key-value pairs keyed by the standard question, ensuring no two samples in a batch share the same q_std
def collate_fn(batch):
texts_list = [[] for _ in range(2)]
for text_list in batch: # q_std is sampled with probability 0.5
p = [0.5] + [0.5/(len(text_list)-1)] * (len(text_list)-1)
texts = np.random.choice(text_list, 2, replace=False, p=p)
for i, text in enumerate(texts):
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
texts_list[i].append(token_ids)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
return texts_list, labels
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = dict()
with open(filename, encoding='utf-8') as f:
for row, l in enumerate(f):
if row == 0: # skip the header row
continue
q_std, q_sim = l.strip().split('\t')
q_std = q_std.replace(' ', '')
q_sim = q_sim.replace(' ', '')
D[q_std] = D.get(q_std, []) + [q_sim]
return [[k]+v for k, v in D.items()]
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls', scale=20.0):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.pool_method = pool_method
self.scale = scale
def forward(self, token_ids_list):
reps = []
for token_ids in token_ids_list:
hidden_state1, pool_cls1 = self.bert([token_ids])
rep = get_pool_emb(hidden_state1, pool_cls1, token_ids.gt(0).long(), self.pool_method)
reps.append(rep)
embeddings_a = reps[0]
embeddings_b = torch.cat(reps[1:])
scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale # [btz, btz] (two text columns per sample)
return scores
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pool_cls = self.bert([token_ids])
output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return output
def encode(self, texts, **kwargs):
token_ids_list = []
for text in texts:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
token_ids_list.append(token_ids)
token_ids_tensor = torch.tensor(sequence_padding(token_ids_list), dtype=torch.long)
valid_dataloader = DataLoader(TensorDataset(token_ids_tensor), batch_size=batch_size)
valid_sen_emb = []
self.eval()
for token_ids in tqdm(valid_dataloader, desc='Evaluate'):
token_ids = token_ids[0].to(device)
output = self.predict(token_ids)
valid_sen_emb.append(output.cpu())
valid_sen_emb = torch.cat(valid_sen_emb, dim=0)
return valid_sen_emb
@staticmethod
def cos_sim(a, b):
a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
return torch.mm(a_norm, b_norm.transpose(0, 1))
model = Model().to(device)
# Multi-class classification loss
class Myloss(nn.Module):
def forward(self, y_pred, y_true):
y_pred = torch.log(torch.softmax(y_pred, dim=-1)) * y_true # [btz, btz]
return -y_pred.sum() / len(y_pred)
# y_pred_pos = (y_pred * y_true).sum(dim=-1)
# y_pred_sum = torch.logsumexp(y_pred, dim=-1)
# return (y_pred_sum - y_pred_pos).sum() / len(y_pred)
# Define the loss and optimizer; custom ones are supported
model.compile(
loss = Myloss() if choice == 'mul_ce' else nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
def __init__(self):
super().__init__()
self.best_perf = 0
def on_dataloader_end(self, logs=None):
model.train_dataloader = DataLoader(MyDataset(fst_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def on_epoch_end(self, global_step, epoch, logs=None):
perf = evaluate(model, epoch=model.epoch, steps=model.global_step, output_path='./')
if perf > self.best_perf:
self.best_perf = perf
model.save_weights(f'./fst_best_weights_{choice}.pt')
print(f'perf: {perf:.2f}, best perf: {self.best_perf:.2f}\n')
if __name__ == '__main__':
# Training set
train_dataloader = DataLoader(MyDataset(fst_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Validation set
ir_queries, ir_corpus, ir_relevant_docs = {}, {}, {}
with open(fst_dev_file, 'r', encoding='utf-8') as f:
next(f)
for line in f:
qid, query, duplicate_ids = line.strip().split('\t')
duplicate_ids = duplicate_ids.split(',')
ir_queries[qid] = query
ir_relevant_docs[qid] = set(duplicate_ids)
ir_corpus_df = pd.read_csv(ir_path, sep='\t')
ir_corpus_df.qid = ir_corpus_df.qid.astype('str')
ir_corpus = dict(zip(ir_corpus_df.qid.tolist(), ir_corpus_df.question.tolist()))
evaluate = evaluation.InformationRetrievalEvaluator(ir_queries, ir_corpus, ir_relevant_docs, name=choice)
evaluator = Evaluator()
model.fit(train_dataloader,
epochs=10,
steps_per_epoch=100,
callbacks=[evaluator]
)
else:
model.load_weights(f'./fst_best_weights_{choice}.pt')
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 准备二阶段训练数据集\n",
"### 1. 用一阶段模型把所有query转成向量"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from config import *\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model\n",
"\n",
"# 读取标问和所有语料\n",
"q_std_list = pd.read_csv(q_std_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"q_corpus = pd.read_csv(q_corpus_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"\n",
"# get embeddings\n",
"q_std_sentence_embeddings = model.encode(q_std_list)\n",
"np.save(fst_q_std_vectors_file, q_std_sentence_embeddings.numpy())\n",
"q_corpus_sentence_embeddings = model.encode(q_corpus)\n",
"np.save(fst_q_corpus_vectors_file, q_corpus_sentence_embeddings.numpy())\n",
"print('标准问向量路径:', fst_q_std_vectors_file)\n",
"print('所有语料保存向量路径:', fst_q_corpus_vectors_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. 为每个q_sim找到topK的的q_std"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model\n",
"from config import *\n",
"from utils import *\n",
"\n",
"# 读取q_std、q_corpus语料和向量\n",
"q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, fst_q_std_vectors_file, q_corpus_file, fst_q_corpus_vectors_file)\n",
"\n",
"print('----加载一阶段训练(标问-相似问)数据集', fst_train_file)\n",
"df_eval = pd.read_csv(fst_train_file, sep=\"\\t\")\n",
"print(\"shape: \", df_eval.shape)\n",
"df_eval = df_eval[df_eval.q_std.isin(q_std_list)]\n",
"print(\"shape: \", df_eval.shape)\n",
"\n",
"df_eval = cal_performance(model, q_all_sentence_embeddings_dict, q_std_sentence_embeddings, q_std_list, df_eval, K=20)\n",
"df_eval.to_csv(fst_std_data_results, index=None, sep=\"\\t\")\n",
"df_eval.iloc[3:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. 二阶段正负样本生成\n",
"预测的topK中和q_std一致的为正样本,不一致的为困难负样本"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"xdf = df_eval.copy(deep=True)\n",
"# xdf['q_std_pred_list']=xdf.q_std_pred_list.apply(lambda v:eval(v))\n",
"print('预测结果中和q_std不一致的'.center(60, '-'))\n",
"xdf['q_std_pred_list_else'] = xdf.apply(lambda row: [v for v in row['q_std_pred_list'] if v[0] != row['q_std']], axis=1)\n",
"xdf['q_std_pred_list_else_v1'] = xdf.q_std_pred_list_else.apply(lambda v: [m[0] for m in v]) # 负样本的文本\n",
"xdf['q_std_pred_list_else_v2'] = xdf.q_std_pred_list_else.apply(lambda v: [m[1] for m in v]) # 负样本的概率\n",
"\n",
"print('组织正负样本'.center(60, '-'))\n",
"xdf['pairs'] = xdf.apply(lambda row: ['1' + '\\t' + row['q_sim'] + '\\t' + row['q_std'] + '\\t' + '1'] + [\n",
" '0' + '\\t' + row['q_sim'] + '\\t' + v[0] + '\\t' + str(v[1]) for v in row['q_std_pred_list_else'][0:10]], axis=1)\n",
"print(xdf.iloc[3]['pairs'])\n",
"\n",
"print('单独处理正负样本'.center(60, '-'))\n",
"q_sim_list = xdf.q_sim.unique().tolist()\n",
"q_std_list = xdf.q_std.unique().tolist()\n",
"q_sim_dict = {q_sim_list[i]: i for i in range(0, len(q_sim_list))}\n",
"q_std_dict = {q_std_list[i]: i for i in range(0, len(q_std_list))}\n",
"pairs = xdf.pairs.tolist()\n",
"pairs_list = [v.split('\\t') for vlist in pairs for v in vlist]\n",
"pairs_df = pd.DataFrame(pairs_list, columns=['label', 'q_sim', 'q_std', 'prob'])\n",
"print(pairs_df.drop_duplicates(['q_std', 'q_sim']).shape)\n",
"pairs_df.head()\n",
"\n",
"pairs_df_2 = pairs_df.sort_values('label', ascending=0).drop_duplicates(['q_sim', 'q_std'])\n",
"pairs_df_final = pairs_df_2\n",
"print(pairs_df_final.shape, pairs_df.shape)\n",
"\n",
"print('对于每一个q_sim,仅保留概率最高的10条样本'.center(60, '-'))\n",
"pairs_df_final['prob'] = pairs_df_final.prob.astype(\"float\")\n",
"pairs_df_final['nrank'] = pairs_df_final.groupby(['label', 'q_sim'])['prob'].rank(ascending=0, method='first')\n",
"df_final = pairs_df_final[pairs_df_final.nrank <= 9].reset_index(drop=True)\n",
"df_final['sim_idx'] = df_final.q_sim.map(q_sim_dict)\n",
"df_final['std_idx'] = df_final.q_std.map(q_std_dict)\n",
"df_final = df_final.sort_values(['sim_idx', 'label', 'nrank'], ascending=[1, 0, 1])[['label', 'q_sim', 'q_std']].reset_index(drop=True)\n",
"\n",
"print('对于每一条标问,随机挑选一条样本作为dev集合'.center(60, '-'))\n",
"xdf['dev_rnd'] = xdf.q_std.apply(lambda v: np.random.rand())\n",
"xdf['nrank_dev'] = xdf.groupby('q_std')['dev_rnd'].rank(ascending=0, method='first')\n",
"q_sim_choose_dev = xdf[xdf.nrank_dev <= 1].drop_duplicates(['q_sim']).q_sim.tolist()\n",
"df_train = df_final.copy(deep=True)\n",
"df_dev = df_final[df_final.q_sim.isin(q_sim_choose_dev)]\n",
"print('第二阶段train集: ', sec_train_file, ', shape: ', df_train.shape)\n",
"df_train[['label', 'q_std', 'q_sim']].to_csv(sec_train_file, sep=\"\\t\", index=None, header=False)\n",
"print('第二阶段dev集: ', sec_dev_file, ', shape', df_dev.shape)\n",
"df_dev[['label', 'q_std', 'q_sim']].to_csv(sec_test_file, sep=\"\\t\", index=None, header=False)\n",
"df_dev[['label', 'q_std', 'q_sim']].to_csv(sec_dev_file, sep=\"\\t\", index=None, header=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
#! -*- coding:utf-8 -*-
# Stage-2 training: finer-grained reranking based on hard negatives
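# A rough sketch of the objective (the standard contrastive loss; assumed to match bert4torch's
# ContrastiveLoss up to constant factors, check the library for the exact form): with distance
# d = 1 - cos(u, v) and label y in {0, 1},
#   loss = y * d**2 + (1 - y) * relu(margin - d)**2
# so positive pairs are pulled together while negatives are pushed beyond the margin.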
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import ContrastiveLoss
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from config import config_path, checkpoint_path, dict_path, sec_train_file, sec_dev_file
import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.metrics import roc_auc_score
import random
import os
# Fix the random seed
seed_everything(42)
maxlen = 64
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
tokens_ids_list = [[] for _ in range(2)]
labels = []
for text1, text2, label in batch:
tokens_ids_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
tokens_ids_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
labels.append(label)
for i, token_ids in enumerate(tokens_ids_list):
tokens_ids_list[i] = torch.tensor(sequence_padding(token_ids), dtype=torch.long, device=device)
labels = torch.tensor(labels, dtype=torch.long, device=device)
return tokens_ids_list, labels
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
label, text1, text2 = l.strip().split('\t')
D.append((text1.replace(' ', ''), text2.replace(' ', ''), int(label)))
return D
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.pool_method = pool_method
def forward(self, token_ids_list):
reps = []
for token_ids in token_ids_list:
hidden_state1, pool_cls1 = self.bert([token_ids])
rep = get_pool_emb(hidden_state1, pool_cls1, token_ids.gt(0).long(), self.pool_method)
reps.append(rep)
embeddings_a = reps[0]
embeddings_b = torch.cat(reps[1:])
scores = 1 - torch.cosine_similarity(embeddings_a, embeddings_b)
return scores
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pool_cls = self.bert([token_ids])
output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return output
def encode(self, texts):
token_ids_list = []
for text in texts:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
token_ids_list.append(token_ids)
token_ids_tensor = torch.tensor(sequence_padding(token_ids_list), dtype=torch.long)
valid_dataloader = DataLoader(TensorDataset(token_ids_tensor), batch_size=batch_size)
valid_sen_emb = []
for token_ids in tqdm(valid_dataloader, desc='Evaluate'):
token_ids = token_ids[0].to(device)
output = self.predict(token_ids)
valid_sen_emb.append(output.cpu())
valid_sen_emb = torch.cat(valid_sen_emb, dim=0)
return valid_sen_emb
model = Model().to(device)
# Define the loss and optimizer; custom ones are supported
model.compile(
loss=ContrastiveLoss(margin=0.8),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
def __init__(self):
super().__init__()
self.best_val_auc = 0
def on_dataloader_end(self, logs=None):
model.train_dataloader = DataLoader(MyDataset(sec_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def on_epoch_end(self, global_step, epoch, logs=None):
val_auc = self.evaluate(valid_dataloader)
if val_auc >= self.best_val_auc:
self.best_val_auc = val_auc
model.save_weights('sec_best_weights.pt')
print(f'val_auc: {val_auc:.5f}, best_val_auc: {self.best_val_auc:.5f}\n')
def evaluate(self, data):
embeddings1, embeddings2, labels = [], [], []
for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data):
embeddings1.append(model.predict(batch_token1_ids).cpu())
embeddings2.append(model.predict(batch_token2_ids).cpu())
labels.append(batch_labels.cpu())
embeddings1 = torch.cat(embeddings1).numpy()
embeddings2 = torch.cat(embeddings2).numpy()
labels = torch.cat(labels).numpy()
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
auc = roc_auc_score(labels, cosine_scores)
return auc
if __name__ == '__main__':
train_dataloader = DataLoader(MyDataset(sec_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(sec_dev_file), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
evaluator = Evaluator()
model.fit(train_dataloader,
epochs=10,
steps_per_epoch=1000,
callbacks=[evaluator]
)
else:
model.load_weights('sec_best_weights.pt')
{
"cells": [
{
"cell_type": "markdown",
"source": [
"### 获得单例文本预测结果"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"from config import *\n",
"from utils import *\n",
"\n",
"q_std_list, fst_q_std_sentence_embeddings, q_all, fst_q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, fst_q_std_vectors_file, q_corpus_file, fst_q_corpus_vectors_file)\n",
"_, sec_q_std_sentence_embeddings, _, sec_q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, sec_q_std_vectors_file, q_corpus_file, sec_q_corpus_vectors_file)\n",
"\n",
"def get_fst_topK(text, K=10):\n",
" text_embedding = model1.encode([text])[0].numpy()\n",
" sims_with_std = np.array(cos_sim4matrix_2(text_embedding, fst_q_std_sentence_embeddings))\n",
" sort_idx = np.argsort(-sims_with_std)[:K]\n",
" sims_q_sort = [q_std_list[idx] for idx in sort_idx]\n",
" sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
" result = list(zip(sims_q_sort, sims_values))\n",
" return (result)\n",
"\n",
"def get_sec_topK(self, text, K=20):\n",
" text_embedding = self.model.encode([text])[0]\n",
" sims_with_std = np.array(self.cos_sim4matrix_2(text_embedding, self.std_sentence_embeddings))\n",
" sort_idx = np.argsort(-sims_with_std)[:K]\n",
" sims_q_sort = [self.q_std_list[idx] for idx in sort_idx]\n",
" sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
" result = list(zip(sims_q_sort, sims_values))\n",
" return (result)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import numpy as np\n",
"from config import *\n",
"from utils import *\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model as model1\n",
"from task_sentence_embedding_FinanceFAQ_step2_1 import model as model2\n",
"\n",
"text = input()\n",
"\n",
"# 第一阶段召回\n",
"result_first = get_fst_topK(text=text)\n",
"print('第一阶段\\n', result_first[0:20])\n",
"first_intents = [v[0] for v in result_first]\n",
"\n",
"# 第二阶段召回\n",
"a_texts_embeddings_2 = np.array(model2.encode([text]))\n",
"b_texts_embeddings_2 = np.array([sec_q_all_sentence_embeddings_dict[v] for v in first_intents])\n",
"sims_with_std = cos_sim4matrix_2(a_texts_embeddings_2, b_texts_embeddings_2).reshape(-1)\n",
"sort_idx = np.argsort(-sims_with_std).tolist()\n",
"intents_sort = [first_intents[idx] for idx in sort_idx]\n",
"sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
"result_second = list(zip(intents_sort, sims_values))\n",
"print('第二阶段\\n', result_second[0:20])"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"orig_nbformat": 4,
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "markdown",
"source": [
"## 计算向量,统计结果等\n",
"### 获得基于第二阶段模型所得标问向量"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from task_sentence_embedding_FinanceFAQ_step2_1 import model\n",
"from config import *\n",
"\n",
"# get list\n",
"q_std_list = pd.read_csv(q_std_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"q_corpus = pd.read_csv(q_corpus_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"\n",
"# get embeddings\n",
"q_std_sentence_embeddings = model.encode(q_std_list)\n",
"print('保存二阶段标准问向量:', sec_q_std_vectors_file)\n",
"np.save(sec_q_std_vectors_file, q_std_sentence_embeddings)\n",
"q_corpus_sentence_embeddings = model.encode(q_corpus)\n",
"print('保存二阶段所有语料向量:', sec_q_corpus_vectors_file)\n",
"np.save(sec_q_corpus_vectors_file, q_corpus_sentence_embeddings)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"### 获得所有待测数据第一阶段模型预测结果"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from config import *\n",
"from utils import *\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model as model1\n",
"\n",
"path_list = fst_eval_path_list\n",
"\n",
"# 读取q_std、q_corpus语料和向量\n",
"q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, fst_q_std_vectors_file, q_corpus_file, fst_q_corpus_vectors_file)\n",
"\n",
"for i, input_path in enumerate(path_list):\n",
" print(f'开始评估新语料: {i}'.center(120, '='))\n",
" df_eval = pd.read_csv(input_path, sep=\"\\t\")\n",
" df_eval = df_eval[~pd.isna(df_eval.q_sim)]\n",
" output_path = input_path[:-4] + '_result.tsv'\n",
" print('input_path: ', input_path, 'output_path: ', output_path)\n",
"\n",
" print(\"目标语料数量:\", df_eval.shape, '标问数量:', df_eval.q_std.nunique(), '相似问数量:',\n",
" df_eval.q_sim.nunique(), '标语料去重后数量', df_eval.drop_duplicates([\"q_std\", \"q_sim\"]).shape[0])\n",
"\n",
" ## v1 对于都是有一个是小量的情况下\n",
" df_eval = cal_performance(model1, q_all_sentence_embeddings_dict, q_std_sentence_embeddings, q_std_list, df_eval, K=10)\n",
" df_eval.to_csv(output_path, index=None, sep=\"\\t\")\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"### 获得所有待测数据第二阶段模型预测结果"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import os\n",
"import torch\n",
"from task_sentence_embedding_FinanceFAQ_step2_1 import model as model2\n",
"import numpy as np\n",
"import pandas as pd\n",
"from config import *\n",
"\n",
"path_list = sec_eval_path_list\n",
"\n",
"# 读取q_std、q_corpus语料和向量\n",
"q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, sec_q_std_vectors_file, q_corpus_file, sec_q_corpus_vectors_file)\n",
"# 标问和向量的映射\n",
"corpus_sentence_embeddings_dict = {q_std_list[i]: q_std_sentence_embeddings[i] for i in range(0, len(q_std_list))}\n",
"\n",
"for i, input_path in enumerate(path_list):\n",
" print(f'开始评估新语料: {i}'.center(120, '='))\n",
" df_eval = pd.read_csv(input_path, sep=\"\\t\")\n",
" output_path = input_path[:-4] + '_result.tsv'\n",
" print('input_path: ', input_path, 'output_path: ', output_path)\n",
"\n",
" texts = df_eval.q_sim.tolist()\n",
" texts_in = [v for v in texts if v in q_all_sentence_embeddings_dict.keys()]\n",
" texts_out = [v for v in texts if v not in q_all_sentence_embeddings_dict.keys()]\n",
" texts_out_embeddings = model2.encode(texts_out) if texts_out else []\n",
" texts_embeddings_dict_1 = {texts_in[i]: q_all_sentence_embeddings_dict[texts_in[i]] for i in range(0, len(texts_in))}\n",
" texts_embeddings_dict_2 = {texts_out[i]: texts_out_embeddings[i] for i in range(0, len(texts_out))}\n",
" texts_embeddings_dict = {**texts_embeddings_dict_1, **texts_embeddings_dict_2}\n",
" print('目标语料编码数量:——>', len(texts_embeddings_dict))\n",
"\n",
" def get_sec_result(text, std_texts):\n",
" '''预测模型2的结果\n",
" '''\n",
" a_text_embeddings = texts_embeddings_dict[text] # 获取改相似问在模型2中的向量\n",
" b_text_embeddings = np.array([corpus_sentence_embeddings_dict[v] for v in std_texts]) # 拿到模型1召回的候选标问在模型2中的向量\n",
" sims_with_std = cos_sim4matrix_2(a_text_embeddings, b_text_embeddings).reshape(-1)\n",
" sort_idx = np.argsort(-sims_with_std).tolist()\n",
" intents_sort = [std_texts[idx] for idx in sort_idx]\n",
" sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
" result = list(zip(intents_sort, sims_values))\n",
" return (result)\n",
"\n",
" # 模型1预测结果\n",
" df_eval['q_std_pred_list_v1'] = df_eval.q_std_pred_list_v1.apply(lambda v: eval(v))\n",
"\n",
" # 模型2预测结果\n",
" df_eval['q_std_pred_list_2'] = df_eval.apply(lambda row: get_sec_result(row['q_sim'], row['q_std_pred_list_v1']), axis=1)\n",
"\n",
" df_eval['q_std_pred_list_2_v1'] = df_eval.q_std_pred_list_2.apply(lambda v: [k[0] for k in v])\n",
" df_eval['q_std_pred_list_2_v2'] = df_eval.q_std_pred_list_2.apply(lambda v: [k[1] for k in v])\n",
" df_eval['q_std_pred_2'] = df_eval.q_std_pred_list_2_v1.apply(lambda v: v[0])\n",
" df_eval['prob_2'] = df_eval.q_std_pred_list_2_v2.apply(lambda v: v[0])\n",
"\n",
" df_eval['r1'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:1] else 0, axis=1)\n",
" df_eval['r3'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:3] else 0, axis=1)\n",
" df_eval['r5'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:5] else 0, axis=1)\n",
" df_eval['r10'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:10] else 0, axis=1)\n",
"\n",
" # 扣除不包含的标准问\n",
" print('目标语料准确率:——>')\n",
" print(df_eval.shape)\n",
" df_1 = df_eval\n",
" print('第一阶段整体准确率', df_1.t1.sum() / df_1.shape[0], df_1.t3.sum() / df_1.shape[0], df_1.t5.sum() / df_1.shape[0], df_1.t10.sum() / df_1.shape[0])\n",
" df_2 = df_eval[df_eval.t10 == 1]\n",
" print('第二阶段整体准确率', df_2.r1.sum() / df_2.shape[0], df_2.r3.sum() / df_2.shape[0], df_2.r5.sum() / df_2.shape[0], df_2.r10.sum() / df_2.shape[0])\n",
" df_3 = df_eval\n",
" print('整体准确率', df_3.r1.sum() / df_3.shape[0], df_3.r3.sum() / df_3.shape[0], df_3.r5.sum() / df_3.shape[0], df_3.r10.sum() / df_3.shape[0])\n",
"\n",
" ##扣除不包含的标准问\n",
" print('目标语料准确率[有效标问]:——>')\n",
" df_k_need = df_eval[df_eval.ifin == 1]\n",
" print(df_k_need.shape)\n",
" df_1 = df_k_need\n",
" print('第一阶段整体准确率', df_1.t1.sum() / df_1.shape[0], df_1.t3.sum() / df_1.shape[0], df_1.t5.sum() / df_1.shape[0], df_1.t10.sum() / df_1.shape[0])\n",
" df_2 = df_k_need[df_k_need.t10 == 1]\n",
" print('第二阶段整体准确率', df_2.r1.sum() / df_2.shape[0], df_2.r3.sum() / df_2.shape[0], df_2.r5.sum() / df_2.shape[0], df_2.r10.sum() / df_2.shape[0])\n",
" df_3 = df_k_need\n",
" print('整体准确率', df_3.r1.sum() / df_3.shape[0], df_3.r3.sum() / df_3.shape[0], df_3.r5.sum() / df_3.shape[0], df_3.r10.sum() / df_3.shape[0])\n",
" df_eval.to_csv(output_path, index=None, sep=\"\\t\")\n"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.8 64-bit ('base': conda)"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
},
"interpreter": {
"hash": "509cf8fb3e64af7327dbc287206db89f13b65f7dad389d82b165e29388b2e60b"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import torch
from torch import Tensor
import numpy as np
import pandas as pd
def pytorch_cos_sim(a: Tensor, b: Tensor):
    """Pairwise cosine similarity between two batches of vectors; returns an [m, n] tensor."""
if not isinstance(a, torch.Tensor):
a = torch.tensor(a)
if not isinstance(b, torch.Tensor):
b = torch.tensor(b)
if len(a.shape) == 1:
a = a.unsqueeze(0)
if len(b.shape) == 1:
b = b.unsqueeze(0)
a_norm = a / a.norm(dim=1)[:, None]
b_norm = b / b.norm(dim=1)[:, None]
return torch.mm(a_norm, b_norm.transpose(0, 1))
def cos_sim(vector_a, vector_b):
    """
    Cosine similarity between two vectors, rescaled from [-1, 1] to [0, 1] via 0.5 + 0.5 * cos.
    :param vector_a: vector a
    :param vector_b: vector b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim
def cos_sim_1(vector_a, vector_b):
    """
    Raw cosine similarity between two vectors (in [-1, 1], no rescaling).
    :param vector_a: vector a
    :param vector_b: vector b
    :return: cos
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    return cos
def cos_sim4matrix(arr, brr):
    # cosine similarity of one vector `arr` against each row of matrix `brr`, rescaled to [0, 1]
    return 0.5 + 0.5 * (arr.dot(brr.T) / (np.sqrt(np.sum(arr * arr)) * np.sqrt(np.sum(brr * brr, axis = 1))))
def cos_sim4matrix_2(arr, brr):
    # same as cos_sim4matrix but returns the raw cosine in [-1, 1]
    return (arr.dot(brr.T) / (np.sqrt(np.sum(arr * arr)) * np.sqrt(np.sum(brr * brr, axis=1))))
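# A minimal, illustrative sanity check (not in the original file) for the helpers above:
# one made-up query vector against a tiny candidate matrix. Call _demo_cos_helpers()
# manually to verify the expected shapes and values.
def _demo_cos_helpers():
    q = np.array([1.0, 0.0, 0.0])                      # single query embedding
    cands = np.array([[1.0, 0.0, 0.0],                 # identical vector  -> cosine 1.0
                      [0.0, 1.0, 0.0]])                # orthogonal vector -> cosine 0.0
    print(cos_sim4matrix_2(q, cands))                  # expected: [1. 0.], shape (2,)
    print(pytorch_cos_sim(q, cands))                   # expected: tensor([[1., 0.]]) of shape (1, 2)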
def read_q_std_q_corpus(q_std_file, q_std_vectors_file, q_corpus_file, q_corpus_vectors_file):
    '''Read the standard questions (q_std) and the full corpus (q_corpus) together with their precomputed vectors.
    '''
print('读取标准问及其向量'.center(60, '-'))
q_std_list = pd.read_csv(q_std_file, sep="\t", names=['c']).c.tolist()
q_std_sentence_embeddings = np.load(q_std_vectors_file)
print('标准问shape:', q_std_sentence_embeddings.shape, len(q_std_list))
print('读取所有语料及其向量'.center(60, '-'))
q_all = pd.read_csv(q_corpus_file, sep="\t", names=['c']).c.tolist()
q_all_sentence_embeddings = np.load(q_corpus_vectors_file)
q_all_sentence_embeddings_dict = {q_all[i]: q_all_sentence_embeddings[i] for i in range(0, len(q_all))}
print('所有语料shape', q_all_sentence_embeddings.shape, len(q_all))
return q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict
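# Hedged helper (not part of the original file) showing the on-disk layout the loader above
# assumes: one text per line in the text file, and an .npy matrix whose i-th row is the vector
# of the i-th text. `texts` and `embeddings` are placeholders supplied by the caller.
def save_corpus_and_vectors(texts, embeddings, text_file, vectors_file):
    pd.Series(texts).to_csv(text_file, index=False, header=False, sep="\t")
    np.save(vectors_file, np.asarray(embeddings))  # vectors_file should end in .npy to match np.load above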
def cal_performance(model, q_all_sentence_embeddings_dict, q_std_sentence_embeddings, q_std_list, df_eval, K=20):
    '''Compute top-K retrieval metrics (hit@1/3/5/10) for the evaluation set.
    '''
texts = df_eval.q_sim.tolist()
texts_in = [v for v in texts if v in q_all_sentence_embeddings_dict.keys()]
texts_out = [v for v in texts if v not in q_all_sentence_embeddings_dict.keys()]
texts_out_embeddings = model.encode(texts_out) if texts_out else []
texts_embeddings_dict_1 = {texts_in[i]: q_all_sentence_embeddings_dict[texts_in[i]] for i in range(0, len(texts_in))}
texts_embeddings_dict_2 = {texts_out[i]: texts_out_embeddings[i] for i in range(0, len(texts_out))}
texts_embeddings_dict = {**texts_embeddings_dict_1, **texts_embeddings_dict_2}
print(f'计算相似度 K= {K}'.center(60, '-'))
df_eval['ifin'] = df_eval.q_std.apply(lambda v: 1 if v in q_std_list else 0)
print("目标语料标问是否存在:——>", df_eval.groupby("ifin")["ifin"].count())
print('----计算所有query和q_std的相似度')
x_texts_embeddings = np.array([texts_embeddings_dict[x_text] for x_text in texts])
cos_scores = pytorch_cos_sim(x_texts_embeddings, q_std_sentence_embeddings).cpu()
print('shape: ', x_texts_embeddings.shape, q_std_sentence_embeddings.shape, cos_scores.shape)
print(f'----为每条相似问找到相似度最大的{K}条标问'.center(60, '-'))
cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, K, dim=1, largest=True, sorted=False)
cos_scores_top_k_values = cos_scores_top_k_values.tolist()
cos_scores_top_k_idx = cos_scores_top_k_idx.tolist()
    cos_q_corpus_sort = [[q_std_list[v] for v in vlist] for vlist in cos_scores_top_k_idx] # the top-K most similar standard questions
result = [list(zip(cos_q_corpus_sort[i], cos_scores_top_k_values[i])) for i in range(0, len(texts))]
texts_topk_dict = {texts[i]: result[i] for i in range(0, len(texts))}
    # for each similar question, collect its predictions: the top-K standard questions and their similarities
df_eval['q_std_pred_list'] = df_eval.q_sim.map(texts_topk_dict)
    # similarity between each q_sim and its gold q_std
df_eval['prob_with_std'] = df_eval.apply(lambda row: cos_sim_1(texts_embeddings_dict[row['q_sim']], q_std_sentence_embeddings[q_std_list.index(row['q_std'])]), axis=1)
df_eval.loc[:, 'q_std_pred'] = df_eval.q_std_pred_list.apply(lambda v: v[0][0])
df_eval.loc[:, 'prob'] = df_eval.q_std_pred_list.apply(lambda v: v[0][1])
# df_eval.loc[:,'q_std_pred_list_pair']=df_eval.apply(lambda row: [(row['q_std'],row['q_sim'],v[0],v[1]) for v in row['q_std_pred_list']],axis=1)
    df_eval['q_std_pred_list_v1'] = df_eval.q_std_pred_list.apply(lambda v: [k[0] for k in v]) # keep only the predicted standard questions
    df_eval['q_std_pred_list_v2'] = df_eval.q_std_pred_list.apply(lambda v: [k[1] for k in v]) # keep only the predicted similarities
df_eval['t1'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:1] else 0, axis=1)
df_eval['t3'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:3] else 0, axis=1)
df_eval['t5'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:5] else 0, axis=1)
df_eval['t10'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:10] else 0, axis=1)
print('----模型准确率: ', df_eval.t1.sum() / df_eval.shape[0], df_eval.t3.sum() / df_eval.shape[0], df_eval.t5.sum() / df_eval.shape[0], df_eval.t10.sum() / df_eval.shape[0])
df_eval_need = df_eval[df_eval.ifin == 1]
print('----模型准确率:[有效标问]:', df_eval_need.t1.sum() / df_eval_need.shape[0], df_eval_need.t3.sum() / df_eval_need.shape[0], df_eval_need.t5.sum() / df_eval_need.shape[0], df_eval_need.t10.sum() / df_eval_need.shape[0])
return df_eval
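# Hedged end-to-end sketch of how the two helpers above are meant to compose. The file paths
# and the `model` object (anything exposing .encode) are placeholders, not files shipped with
# this repo; eval_file is assumed to be a tab-separated table containing at least the q_std
# and q_sim columns used by cal_performance.
def run_eval_pipeline(model, q_std_file, q_std_vectors_file, q_corpus_file,
                      q_corpus_vectors_file, eval_file, K=20):
    q_std_list, q_std_emb, _, q_all_emb_dict = read_q_std_q_corpus(
        q_std_file, q_std_vectors_file, q_corpus_file, q_corpus_vectors_file)
    df_eval = pd.read_csv(eval_file, sep="\t")
    return cal_performance(model, q_all_emb_dict, q_std_emb, q_std_list, df_eval, K=K)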
#! -*- coding:utf-8 -*-
# Compress sentence vectors with PCA
# From 768 dims down to 128 dims; the metric drops from 81.82 to 80.10
from task_sentence_embedding_sup_CosineMSELoss import model, train_dataloader, Model, device, valid_dataloader, evaluate
from bert4torch.snippets import get_pool_emb
from sklearn.decomposition import PCA
import numpy as np
import torch
import torch.nn as nn
new_dimension = 128  # target dimensionality after compression
train_embeddings = []
for token_ids_list, labels in train_dataloader:
for token_ids in token_ids_list:
train_embeddings.append(model.encode(token_ids))
# if len(train_embeddings) >= 20:
# break
train_embeddings = torch.cat(train_embeddings, dim=0).cpu().numpy()
print('train_embeddings done, start pca training...')
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)
print('PCA training done...')
# 定义bert上的模型结构
class NewModel(Model):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.dense = nn.Linear(768, new_dimension, bias=False)
self.dense.weight = torch.nn.Parameter(torch.tensor(pca_comp, device=device))
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pool_cls = self.bert([token_ids])
attention_mask = token_ids.gt(0).long()
output = get_pool_emb(hidden_state, pool_cls, attention_mask, self.pool_method)
output = self.dense(output)
return output
new_model = NewModel().to(device)
new_model.load_weights('best_model.pt', strict=False)
print('Start evaluating...')
val_cosine = evaluate(new_model, valid_dataloader)
print(f'val_cosine: {val_cosine:.5f}\n')
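# Optional check (not in the original script): how much of the embedding variance the
# 128-dim projection keeps; `pca` is the fitted object from above.
print('explained variance kept by PCA: %.4f' % pca.explained_variance_ratio_.sum())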
#! -*- coding:utf-8 -*-
# Model compression: keep only part of the bert-base layers
# In a rough test the metric dropped from ~80% to ~77%; not measured carefully
from task_sentence_embedding_sup_CosineMSELoss import model, train_dataloader, Model, device, valid_dataloader, evaluate
from bert4torch.snippets import Callback, get_pool_emb
import torch.optim as optim
import torch.nn as nn
from bert4torch.models import build_transformer_model
train_token_ids, train_embeddings = [], []
for token_ids_list, labels in train_dataloader:
train_token_ids.extend(token_ids_list)
for token_ids in token_ids_list:
train_embeddings.append(model.encode(token_ids))
# if len(train_embeddings) >= 20:
# break
new_train_dataloader = list(zip(train_token_ids, train_embeddings))
print('train_embeddings done, start model distillation...')
# keep only a fixed subset of hidden layers
class NewModel(Model):
def __init__(self, **kwargs):
super().__init__(**kwargs)
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
self.bert = build_transformer_model(config_path=config_path, with_pool=True, segment_vocab_size=0, keep_hidden_layers=[1,4,7])
def forward(self, token_ids):
hidden_state, pooler = self.bert([token_ids])
attention_mask = token_ids.gt(0).long()
output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
return output
new_model = NewModel().to(device)
new_model.compile(
loss=nn.MSELoss(),
optimizer=optim.Adam(new_model.parameters(), lr=2e-5),
)
new_model.load_weights('best_model.pt', strict=False)  # load the matching layers from the full model
val_cosine = evaluate(new_model, valid_dataloader)
print('init val_cosine after distillation: ', val_cosine)
class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_cosine = 0.
    def on_epoch_end(self, global_step, epoch, logs=None):
        val_cosine = evaluate(new_model, valid_dataloader)
        if val_cosine > self.best_val_cosine:
            self.best_val_cosine = val_cosine
            # new_model.save_weights('best_model.pt')
        print(f'val_cosine: {val_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
new_model.fit(new_train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
new_model.load_weights('best_model.pt')
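# Rough size comparison (illustrative only, not in the original script): the imported
# `model` is the full 12-layer teacher, `new_model` keeps only hidden layers [1, 4, 7].
teacher_params = sum(p.numel() for p in model.parameters())
student_params = sum(p.numel() for p in new_model.parameters())
print(f'teacher params: {teacher_params:,}, student params: {student_params:,}')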
#! -*- coding:utf-8 -*-
# Original project: https://kexue.fm/archives/8847
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys
# =============================基本参数=============================
# pooling, task_name = sys.argv[1:] # 传入参数
pooling, task_name = 'cls', 'ATEC' # debug使用
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], int(l[2])))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text1, text2, label in batch:
for text in [text1, text2]:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
def forward(self, token_ids):
hidden_state, pooler = self.bert([token_ids])
sem_emb = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
return sem_emb
model = Model().to(device)
class MyLoss(nn.Module):
def forward(self, y_pred, y_true):
        # 1. keep the true labels (one per sentence pair)
        y_true = y_true[::2]    # e.g. tensor([1, 0, 1]), the gold labels
        # 2. l2-normalize the sentence vectors, so that a plain elementwise product + sum gives the cosine
        norms = (y_pred ** 2).sum(axis=1, keepdims=True) ** 0.5
        # y_pred = y_pred / torch.clip(norms, 1e-8, torch.inf)
        y_pred = y_pred / norms
        # 3. cosine of each (odd, even) sentence pair, scaled by 20
        y_pred = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * 20
        # 4. pairwise differences between all cosine values
        y_pred = y_pred[:, None] - y_pred[None, :]  # entry (i, j) holds the i-th cosine minus the j-th cosine
        # only differences of the form (cosine of a negative pair) - (cosine of a positive pair) are kept
        y_true = y_true[:, None] < y_true[None, :]   # mask: label_i < label_j
        y_true = y_true.float()
        y_pred = y_pred - (1 - y_true) * 1e12
        y_pred = y_pred.view(-1)
        y_pred = torch.cat((torch.tensor([0.0], device=device), y_pred), dim=0)  # prepend 0 because e^0 = 1, i.e. the "+1" inside the log
        return torch.logsumexp(y_pred, dim=0)
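# Illustrative sanity check for MyLoss (a CoSENT-style ranking loss), using made-up
# embeddings; labels are duplicated once per sentence to match the y_true[::2] step above.
# The loss should be near zero when positive pairs already score above negative pairs.
def _demo_cosent_loss():
    emb = torch.tensor([[1.0, 0.0], [1.0, 0.1],    # pair 0: similar sentences, label 1
                        [1.0, 0.0], [0.0, 1.0]],   # pair 1: dissimilar sentences, label 0
                       device=device)
    labels = torch.tensor([1., 1., 0., 0.], device=device)
    return MyLoss()(emb, labels)  # expected: a value close to 0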
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=MyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_cosine = 0.
    def on_epoch_end(self, global_step, epoch, logs=None):
        val_cosine = self.evaluate(valid_dataloader)
        test_cosine = self.evaluate(test_dataloader)
        if val_cosine > self.best_val_cosine:
            self.best_val_cosine = val_cosine
            # model.save_weights('best_model.pt')
        print(f'valid_cosine: {val_cosine:.5f}, test_cosine: {test_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
    # evaluation helper
def evaluate(self, data):
embeddings1, embeddings2, labels = [], [], []
for batch_token_ids, batch_labels in tqdm(data, desc='Evaluate'):
embeddings = model.predict(batch_token_ids)
embeddings1.append(embeddings[::2])
embeddings2.append(embeddings[1::2])
labels.append(batch_labels[::2])
embeddings1 = torch.cat(embeddings1).cpu().numpy()
embeddings2 = torch.cat(embeddings2).cpu().numpy()
labels = torch.cat(labels).cpu().numpy()
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)  # Spearman rank correlation of labels vs cosine scores
        return eval_spearman_cosine
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# loss: ContrastiveLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
from bert4torch.losses import ContrastiveLoss
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys
# =============================基本参数=============================
# pooling, task_name = sys.argv[1:] # 传入参数
pooling, task_name = 'cls', 'ATEC' # debug使用
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], int(l[2])))
return D
def collate_fn(batch):
batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
for text1, text2, label in batch:
token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
batch_token1_ids.append(token1_ids)
token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
batch_token2_ids.append(token2_ids)
batch_labels.append([int(label>2.5) if task_name == 'STS-B' else label])
batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
def forward(self, token1_ids, token2_ids):
hidden_state1, pool_cls1 = self.bert([token1_ids])
pool_emb1 = get_pool_emb(hidden_state1, pool_cls1, token1_ids.gt(0).long(), self.pool_method)
hidden_state2, pool_cls2 = self.bert([token2_ids])
pool_emb2 = get_pool_emb(hidden_state2, pool_cls2, token2_ids.gt(0).long(), self.pool_method)
distance = 1- torch.cosine_similarity(pool_emb1, pool_emb2)
return distance
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.bert([token_ids])
attention_mask = token_ids.gt(0).long()
output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=ContrastiveLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
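# For reference only: a generic contrastive loss of the Hadsell et al. form, sketched here to
# show what a loss over (cosine distance, binary label) typically computes. bert4torch's
# ContrastiveLoss may differ in details such as the default margin; this is not its implementation.
def _sketch_contrastive_loss(distance, label, margin=0.5):
    # label 1 = similar pair (pull distance towards 0); label 0 = dissimilar (push beyond the margin)
    return 0.5 * (label * distance.pow(2) + (1 - label) * torch.relu(margin - distance).pow(2)).mean()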
class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_cosine = 0.
    def on_epoch_end(self, global_step, epoch, logs=None):
        val_cosine = self.evaluate(valid_dataloader)
        test_cosine = self.evaluate(test_dataloader)
        if val_cosine > self.best_val_cosine:
            self.best_val_cosine = val_cosine
            # model.save_weights('best_model.pt')
        print(f'valid_cosine: {val_cosine:.5f}, test_cosine: {test_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
    # evaluation helper
def evaluate(self, data):
embeddings1, embeddings2, labels = [], [], []
for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
embeddings1.append(model.predict(batch_token1_ids).cpu())
embeddings2.append(model.predict(batch_token2_ids).cpu())
labels.append(batch_labels)
embeddings1 = torch.cat(embeddings1).numpy()
embeddings2 = torch.cat(embeddings2).numpy()
labels = torch.cat(labels).cpu().numpy()
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
return eval_pearson_cosine
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# loss: CosineMSELoss(cos + mse_loss)
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys
# =============================基本参数=============================
# pooling, task_name = sys.argv[1:] # 传入参数
pooling, task_name = 'cls', 'ATEC' # debug使用
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) != 3:
continue
text1, text2, label = l
label = int(label)/5 if task_name == 'STS-B' else int(label)
D.append((text1, text2, label))
return D
def collate_fn(batch):
batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
for text1, text2, label in batch:
token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
batch_token1_ids.append(token1_ids)
token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
batch_token2_ids.append(token2_ids)
batch_labels.append([label])
batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
def forward(self, token1_ids, token2_ids):
hidden_state1, pooler1 = self.bert([token1_ids])
pool_emb1 = get_pool_emb(hidden_state1, pooler1, token1_ids.gt(0).long(), self.pool_method)
hidden_state2, pooler2 = self.bert([token2_ids])
pool_emb2 = get_pool_emb(hidden_state2, pooler2, token2_ids.gt(0).long(), self.pool_method)
return torch.cosine_similarity(pool_emb1, pool_emb2)
def encode(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.bert([token_ids])
attention_mask = token_ids.gt(0).long()
output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.MSELoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5)
)
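# Toy illustration (made-up numbers, not part of training) of the objective compiled above:
# the cosine similarity predicted by the model is regressed directly onto the pair label
# (0/1 here, or a 0-1 score for STS-B).
def _demo_cosine_mse():
    pred_cos = torch.tensor([0.9, 0.1])    # cosine outputs for a similar and a dissimilar pair
    target = torch.tensor([1.0, 0.0])      # pair labels
    return nn.MSELoss()(pred_cos, target)  # expected: 0.01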
# 定义评价函数
def evaluate(model, data):
embeddings1, embeddings2, labels = [], [], []
for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
embeddings1.append(model.encode(batch_token1_ids))
embeddings2.append(model.encode(batch_token2_ids))
labels.append(batch_labels)
embeddings1 = torch.cat(embeddings1).cpu().numpy()
embeddings2 = torch.cat(embeddings2).cpu().numpy()
labels = torch.cat(labels).cpu().numpy()
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)  # Spearman rank correlation of labels vs cosine scores
    return eval_spearman_cosine
class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_cosine = 0.
    def on_epoch_end(self, global_step, epoch, logs=None):
        val_cosine = evaluate(model, valid_dataloader)
        test_cosine = evaluate(model, test_dataloader)
        if val_cosine > self.best_val_cosine:
            self.best_val_cosine = val_cosine
            # model.save_weights('best_model.pt')
        print(f'valid_cosine: {val_cosine:.5f}, test_cosine: {test_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
#! -*- coding:utf-8 -*-
# loss: InfoNCE (the MultipleNegativesRankingLoss in sentence_transformers)
# The data are labelled positive/negative pairs, so (anchor, positive, negative) triplets are built as follows:
# for a positive pair (anchor, positive) a negative is sampled at random to give (anchor, positive, negative);
# for a negative pair (anchor, negative) the anchor is reused as its own positive, i.e. (anchor, anchor, negative)
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
import random
from tqdm import tqdm
import sys
# =============================基本参数=============================
# pooling, task_name = sys.argv[1:] # 传入参数
pooling, task_name = 'cls', 'ATEC' # debug使用
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# ===========================数据预处理===========================
# 训练
def collate_fn(batch):
texts_list = [[] for _ in range(3)]
for texts in batch:
for i, text in enumerate(texts):
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
texts_list[i].append(token_ids)
for i, texts in enumerate(texts_list):
texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)  # in-batch negatives: the i-th anchor's positive sits at column i of the score matrix
return texts_list, labels
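# Minimal illustration (not used by the training loop) of why labels = arange: with in-batch
# negatives, row i of the [btz, 2*btz] score matrix should have its largest score at column i,
# i.e. on its own positive, so CrossEntropyLoss over the rows implements InfoNCE.
def _demo_in_batch_negatives():
    scores = torch.tensor([[0.9, 0.1, 0.2, 0.0],
                           [0.2, 0.8, 0.1, 0.3]]) * 20   # toy scaled cosine scores, batch size 2
    labels = torch.arange(scores.size(0))                # tensor([0, 1])
    return nn.CrossEntropyLoss()(scores, labels)         # small when the diagonal dominates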
# 加载数据集
def get_data(filename):
train_data, all_texts = {}, []
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) != 3:
continue
text1, text2, label = l
label = str(int(int(label) > 2.5)) if task_name == 'STS-B' else label
if text1 not in train_data:
train_data[text1] = {'0': set(), '1': set()}
train_data[text1][label].add(text2)
if text2 not in train_data:
train_data[text2] = {'0': set(), '1': set()}
train_data[text2][label].add(text1)
all_texts.extend([text1, text2])
train_samples = []
for sent1, others in train_data.items():
        if len(others['1']) == 0:
            others['1'] = [sent1]  # no positive available: use the sentence itself (effectively unsupervised)
        elif len(others['0']) == 0:
            others['0'] = [random.choice(all_texts)]  # no negative available: sample a random sentence as the negative
        # sentence-bert would add both orderings below; the drawback is that with shuffle=False the two
        # land in the same batch and a similar sentence can then act as an in-batch negative
if random.random() < 0.5:
train_samples.append((sent1, random.choice(list(others['1'])), random.choice(list(others['0']))))
else:
train_samples.append((random.choice(list(others['1'])), sent1, random.choice(list(others['0']))))
return train_samples
train_data = get_data(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data')
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) == 3:
D.append((l[0], l[1], int(l[2])))
return D
def collate_fn_eval(batch):
batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
for text1, text2, label in batch:
token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
batch_token1_ids.append(token1_ids)
token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
batch_token2_ids.append(token2_ids)
batch_labels.append([label])
batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()
# 加载数据集
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn_eval)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn_eval)
# 建立模型
class Model(BaseModel):
def __init__(self, pool_method='cls', scale=20.0):
super().__init__()
self.pool_method = pool_method
with_pool = 'linear' if pool_method == 'pooler' else True
output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0,
with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
self.scale = scale
def forward(self, token_ids_list):
reps = []
for token_ids in token_ids_list:
hidden_state1, pooler = self.bert([token_ids])
rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
reps.append(rep)
embeddings_a = reps[0]
embeddings_b = torch.cat(reps[1:])
scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale # [btz, btz]
return scores
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_state, pooler = self.bert([token_ids])
output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
return output
@staticmethod
def cos_sim(a, b):
a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
return torch.mm(a_norm, b_norm.transpose(0, 1))
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_cosine = 0.
    def on_epoch_end(self, global_step, epoch, logs=None):
        val_cosine = self.evaluate(valid_dataloader)
        test_cosine = self.evaluate(test_dataloader)
        if val_cosine > self.best_val_cosine:
            self.best_val_cosine = val_cosine
            # model.save_weights('best_model.pt')
        print(f'valid_cosine: {val_cosine:.5f}, test_cosine: {test_cosine:.5f}, best_val_cosine: {self.best_val_cosine:.5f}\n')
        # regenerate the dataloader so the random positive/negative sampling is redrawn each epoch
train_data = get_data(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data')
model.train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 定义评价函数
def evaluate(self, data):
embeddings1, embeddings2, labels = [], [], []
for (batch_token_ids1, batch_token_ids2), batch_labels in tqdm(data, desc='Evaluate'):
embeddings1.append(model.predict(batch_token_ids1))
embeddings2.append(model.predict(batch_token_ids2))
labels.append(batch_labels)
embeddings1 = torch.cat(embeddings1).cpu().numpy()
embeddings2 = torch.cat(embeddings2).cpu().numpy()
labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) # cosine similarity = 1 - paired cosine distance
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)  # Spearman rank correlation of labels vs cosine scores
        return eval_spearman_cosine
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader,
epochs=10,
steps_per_epoch=None,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.pt')