Commit 66a1d0d0 authored by yangzhong

Initial commit of the bert4torch project
#! -*- coding: utf-8 -*-
# Fine-tune T5 PEGASUS for a Seq2Seq task; the BertTokenizer is used
# Introduction: https://kexue.fm/archives/8209
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_t5_pegasus.py
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import jieba
jieba.initialize()
# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000
# Pretrained model configuration
pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/'
config_path = pretrain_model + 'config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
dict_path = pretrain_model + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
tokenizer = Tokenizer(
dict_path,
do_lower_case=True,
pre_tokenize=lambda s: jieba.cut(s, HMM=False)
)
def collate_fn(batch):
"""单条样本格式:content:[CLS]文章[SEP] tgt: [CLS]标题[SEP]
"""
batch_content_ids, batch_titile_ids = [], []
for title, content in batch:
token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
batch_content_ids.append(token_ids)
token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
batch_titile_ids.append([0] + token_ids)
batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()
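# Note on teacher forcing: a 0 is prepended to the title ids as the decoder start token,
# the decoder is fed batch_titile_ids[:, :-1], and the flattened batch_titile_ids[:, 1:]
# (the same sequence shifted left by one) is used as the cross-entropy target.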
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
model = build_transformer_model(
config_path,
checkpoint_path,
model='mt5.1.1',
segment_vocab_size=0,
attention_scale=False,
is_dropout=True,
    tie_emb_prj_weight=False,  # embedding and output projection weights are not tied
).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, y_true):
_, _, y_pred = outputs
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
return super().forward(y_pred, y_true)
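# The model forward returns several tensors; only the final decoder logits (the last
# element) are used here, flattened to [batch*seq_len, vocab] so they line up with the
# flattened target ids; padding positions are skipped via ignore_index=0 below.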
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
        # inputs holds the encoder outputs; together with output_ids it forms [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position
def generate(self, text, topk=1):
token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
token_ids = torch.tensor([token_ids], device=device)
encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam-search decoding
return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])
autotitle = AutoTitle(start_id=0, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
if __name__ == '__main__':
evaluator = Evaluator()
just_show()
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# Fine-tune the UER version of T5 for a Seq2Seq task
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000
# Pretrained model configuration
config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
# Load and trim the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
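# simplified=True trims rarely used entries from the vocabulary; keep_tokens records the
# retained token ids so the checkpoint's embedding rows can be re-indexed to the smaller
# vocabulary when the model is built with keep_tokens below.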
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
"""单条样本格式:content:[CLS]文章[SEP] tgt: [CLS]标题[SEP]
"""
batch_content_ids, batch_titile_ids = [], []
for title, content in batch:
token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
batch_content_ids.append(token_ids)
token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
batch_titile_ids.append(token_ids)
batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()
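# Unlike the PEGASUS script above, no extra id is prepended here: the encoded title
# already starts with [CLS], which doubles as the decoder start token
# (start_id=tokenizer._token_start_id when AutoTitle is constructed below).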
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
model = build_transformer_model(
config_path,
checkpoint_path,
model='t5.1.0',
segment_vocab_size=0,
attention_scale=False,
is_dropout=True,
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, y_true):
_, _, y_pred = outputs
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position
def generate(self, text, topk=1):
token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
token_ids = torch.tensor([token_ids], device=device)
encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam-search decoding
return tokenizer.decode(output_ids.cpu().numpy())
autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# BERT for a Seq2Seq task using the UNILM scheme
# Introduction: https://kexue.fm/archives/6933
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# Adds the evaluation metrics bleu, rouge-1, rouge-2 and rouge-l
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Basic parameters
maxlen = 256
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000
# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
title, content = l['title'], l['abst']
D.append((title, content))
return D
# Load and trim the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def collate_fn(batch):
"""单条样本格式:[CLS]文章[SEP]标题[SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for title, content in batch:
token_ids, segment_ids = tokenizer.encode(content, title, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
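# The same (token_ids, segment_ids) pair is returned as both model input and target:
# the UNILM loss below recovers the gold title tokens from token_ids and uses
# segment_ids (1 over the title span) as the mask for the positions to predict.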
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
'''
y_pred: [btz, seq_len, hdsz]
targets: y_true, y_segment
'''
_, y_pred = outputs
y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids, which conveniently mark the positions to predict
        y_pred = y_pred[:, :-1, :]  # predictions, shifted one position to the left
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 2e-5))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
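        # generated tokens are given segment id 1, so under the UNILM attention mask they
        # can attend to the whole source but only to previously generated target tokens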
_, y_pred = model.predict([token_ids, segment_ids])
return y_pred[:, -1, :]
def generate(self, text, topk=1, topp=0.95):
max_c_len = maxlen - self.maxlen
token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam-search decoding
return tokenizer.decode(output_ids.cpu().numpy())
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, steps, epoch, logs=None):
just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the best model
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
smoothing_function=self.smooth)
rouge_1, rouge_2, rouge_l, bleu = rouge_1/total, rouge_2/total, rouge_l/total, bleu/total
return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
def just_show():
s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
if __name__ == '__main__':
just_show()
evaluator = Evaluator()
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# SimBERT pre-training code; it can also be used for fine-tuning, although other approaches such as sentence_bert may work better for that
# Official project: https://github.com/ZhuiyiTechnology/simbert
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder, Callback, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab
# Basic settings
maxlen = 32
batch_size = 32
# The simbert weights are loaded here and further pretrained/finetuned on your own data
# To pretrain from scratch instead, you can load a bert/roberta checkpoint directly
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load and trim the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取语料,每行一个json
示例:{"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
D.append(json.loads(l))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
for d in batch:
text, synonyms = d['text'], d['synonyms']
synonyms = [text] + synonyms
np.random.shuffle(synonyms)
text, synonym = synonyms[:2]
text, synonym = truncate(text), truncate(synonym)
token_ids, segment_ids = tokenizer.encode(text, synonym, maxlen=maxlen * 2)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
token_ids, segment_ids = tokenizer.encode(synonym, text, maxlen=maxlen * 2)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
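# Each (text, synonym) pair is encoded in both orders, so every sentence serves once as
# the UNILM source and once as the target; as in the UNILM script above, the ids double
# as both model inputs and seq2seq targets.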
train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Build and load the model
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool='linear',
with_mlm='linear', application='unilm', keep_tokens=keep_tokens)
self.pool_method = pool_method
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return seq_logit, sen_emb
model = Model(pool_method='cls').to(device)
class TotalLoss(nn.Module):
"""loss分两部分,一是seq2seq的交叉熵,二是相似度的交叉熵。
"""
def forward(self, outputs, target):
seq_logit, sen_emb = outputs
seq_label, seq_mask = target
seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
similarity_loss = self.compute_loss_of_similarity(sen_emb)
return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}
def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
'''
y_pred: [btz, seq_len, hdsz]
y_true: [btz, seq_len]
y_mask: [btz, seq_len]
'''
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # marks the positions to predict
        y_pred = y_pred[:, :-1, :]  # predictions, shifted one position to the left
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return F.cross_entropy(y_pred, y_true, ignore_index=0)
def compute_loss_of_similarity(self, y_pred):
        y_true = self.get_labels_of_similarity(y_pred)  # build the labels
        y_pred = F.normalize(y_pred, p=2, dim=-1)  # L2-normalize the sentence embeddings
        similarities = torch.matmul(y_pred, y_pred.T)  # similarity matrix
        similarities = similarities - torch.eye(y_pred.shape[0], device=device) * 1e12  # mask out the diagonal
        similarities = similarities * 30  # temperature scaling
loss = F.cross_entropy(similarities, y_true)
return loss
def get_labels_of_similarity(self, y_pred):
idxs = torch.arange(0, y_pred.shape[0], device=device)
idxs_1 = idxs[None, :]
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
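        # collate_fn appends each pair twice, as (text, synonym) and then (synonym, text),
        # so rows 2k and 2k+1 are mutual positives; this expression maps every row index
        # to its partner's index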
labels = idxs_1.eq(idxs_2).float()
return labels
model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps('logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
seq_logit, _ = model.predict([token_ids, segment_ids])
return seq_logit[:, -1, :]
def generate(self, text, n=1, topk=5):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]
synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
X, S = [], []
for t in text_list:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
_, Z = model.predict([X, S])
return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
r = [text] + r
Z = cal_sen_emb(r)
    Z /= (Z**2).sum(dim=1, keepdim=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
def just_show(some_samples):
"""随机观察一些样本的效果
"""
S = [np.random.choice(some_samples) for _ in range(3)]
for s in S:
try:
print(u'原句子:%s' % s)
print(u'同义句子:', gen_synonyms(s, 10, 10))
print()
except:
pass
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
        # keep the best (lowest) loss
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show some demo generations
just_show(['微信和支付宝拿个好用?',
'微信和支付宝,哪个好?',
'微信和支付宝哪个好',
'支付宝和微信哪个好',
'支付宝和微信哪个好啊',
'微信和支付宝那个好用?',
'微信和支付宝哪个好用',
'支付宝和微信那个更好',
'支付宝和微信哪个好用',
'微信和支付宝用起来哪个好?',
'微信和支付宝选哪个好'
])
if __name__ == '__main__':
    choice = 'similarity'  # options: 'train', 'generate', 'similarity'
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
elif choice == 'generate':
print(gen_synonyms('我想去北京玩玩可以吗', 10, 10))
elif choice == 'similarity':
target_text = '我想去首都北京玩玩'
text_list = ['我想去北京玩', '北京有啥好玩的吗?我想去看看', '好渴望去北京游玩啊']
Z = cal_sen_emb([target_text]+text_list)
        Z /= (Z**2).sum(dim=1, keepdim=True)**0.5
similarity = torch.matmul(Z[1:], Z[0])
for i, line in enumerate(text_list):
print(f'cos_sim: {similarity[i].item():.4f}, tgt_text: "{target_text}", cal_text: "{line}"')
else:
model.load_weights('./best_model.pt')
#! -*- coding:utf-8 -*-
# BERT + CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = '/bert4torch/datasets/bert-base-chinese/config.json'
checkpoint_path = '/bert4torch/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/bert4torch/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
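# rematch aligns the character-level entity spans with the wordpiece positions; entities
# whose boundaries fall outside the (possibly truncated) token sequence are silently dropped.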
# Build the dataloaders
train_dataloader = DataLoader(MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # one emission score per tag
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
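# Calling model.crf(emission_score, attention_mask, labels) is expected to return the CRF
# training loss, i.e. the negative log-likelihood of the gold tag sequence.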
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5)) # fp32
# model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), use_amp=True) # fp16
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
        # token-level metrics
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
        # entity-level metrics
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
    '''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation
    '''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT + CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os
maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
#config_path = '/datasets/bert-base-chinese/bert_config.json'
config_path = '/bert4torch/datasets/bert-base-chinese/config.json'
checkpoint_path = '/bert4torch/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/bert4torch/datasets/bert-base-chinese/vocab.txt'
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
local_rank = int(os.environ['LOCAL_RANK'])
print("local_rank ", local_rank)
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
torch.distributed.init_process_group(backend='nccl')
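# LOCAL_RANK is injected by the torch.distributed launcher (torchrun / torch.distributed.run);
# each process pins itself to its own GPU before joining the NCCL process group.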
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Build the dataloaders
#train_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
train_dataset = MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # one emission score per tag
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
# Wrap the model with DDP for multi-GPU training; master_rank is the local_rank used to print training progress
model = BaseModelDDP(model, master_rank=0, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=False)
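# After wrapping, the original methods live on model.module, which is why the loss uses
# model.module.crf and evaluation uses model.module.predict below.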
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.module.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5)) # fp32
# Define the loss and optimizer; custom implementations are supported
# model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), use_amp=True) # fp16
#compile(self, loss, optimizer, scheduler=None, max_grad_norm=None, use_amp=False, metrics=None, adversarial_train={'name': ''}):
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.module.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
        # token-level metrics
#print("#### scores: ", scores)
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
        # entity-level metrics
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
    '''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation
    '''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT + CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import os
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
#config_path = '/datasets/bert-base-chinese/bert_config.json'
config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = '/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
local_rank = int(os.environ['LOCAL_RANK'])
print("local_rank ", local_rank)
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# DDP init
dist.init_process_group("nccl", init_method='env://')
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Build the dataloaders
train_dataset = MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # one emission score per tag
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.module.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
# DDP
model = DDP(model, device_ids=[local_rank], output_device=local_rank)
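# With the plain torch DDP wrapper the bert4torch training loop is reached through
# model.module (see model.module.fit under __main__ below).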
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.module.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
        # token-level metrics
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
        # entity-level metrics
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
    '''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation
    '''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
#model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
model.module.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
NUM=$(($(rocm-smi |sed -n '/DCU/,/===/ p'|wc -l)-2))
START=0
if [ $# -gt 0 ];then ##DCU Number
NUM=$1
fi
if [ $# -gt 1 ];then ##The First DCU ID
START=$2
fi
LAST=$((START+NUM-1))
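# Expose DCUs START..LAST to the job; HIP_VISIBLE_DEVICES is the ROCm/DCU counterpart of CUDA_VISIBLE_DEVICES.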
export HIP_VISIBLE_DEVICES=$(seq -s, ${START} ${LAST})
export HSA_FORCE_FINE_GRAIN_PCIE=1
logfile=bert_base_${NUM}dcu_`date +%Y%m%d%H%M%S`.log
python3 -m torch.distributed.run --nproc_per_node=${NUM} crf_ddp.py 2>&1 | tee $logfile
logfile=bert_base_`date +%Y%m%d%H%M%S`.log
python3 crf.py 2>&1 | tee $logfile
#! -*- coding:utf-8 -*-
# BERT + CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
#config_path = '/datasets/bert-base-chinese/bert_config.json'
config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = '/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# Build the dataloaders
train_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # one emission score per tag
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
        # token-level metrics
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
        # entity-level metrics
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
    '''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation
    '''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT + CRF cascade: stage one tags BIO spans, stage two classifies the entity type of each span
# Reference blog: https://zhuanlan.zhihu.com/p/166496466
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 98.11; entity_level: 96.23
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels, batch_entity_ids, batch_entity_labels = [], [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
entity_ids, entity_labels = [], []
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
                labels[start] = 1  # mark B
                labels[start + 1:end + 1] = 2  # mark I
                entity_ids.append([start, end])
                entity_labels.append(categories.index(label)+1)
        if not entity_ids:  # there must be at least one entity span
            entity_ids.append([0, 0])  # pad with a zero span if there is none
            entity_labels.append(0)
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_entity_ids.append(entity_ids)
batch_entity_labels.append(entity_labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)  # [btz, num_entities, start/end]
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device)  # [btz, num_entities]
return [batch_token_ids, batch_entity_ids], [batch_labels, batch_entity_labels]
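# Stage one is trained on dense BIO tags (0=O, 1=B, 2=I) without entity types, while stage
# two receives the gold spans and learns to classify their type; label 0 is reserved for
# the padded dummy span.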
# Build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.dense1 = nn.Linear(768, len(categories))
        self.dense2 = nn.Linear(768, len(categories)+1)  # +1 for the padding class
self.crf = CRF(len(categories))
def forward(self, inputs):
        # stage-one output
        token_ids, entity_ids = inputs[0], inputs[1]
        last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.dense1(last_hidden_state)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0)
        # stage-two output
btz, entity_count, _ = entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
        entity_states = torch.mean(entity_states, dim=2)  # average of the hidden states at the entity start and end positions
        entity_logit = self.dense2(entity_states)  # [btz, num_entities, num_entity_types]
return emission_score, attention_mask, entity_logit
def predict(self, token_ids):
self.eval()
with torch.no_grad():
            # stage-one inference
            last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
            emission_score = self.dense1(last_hidden_state)  # [btz, seq_len, tag_size]
            attention_mask = token_ids.gt(0)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
            # stage-two inference
batch_entity_ids = []
for one_samp in best_path:
entity_ids = []
for j, item in enumerate(one_samp):
if item.item() == 1: # B
entity_ids.append([j, j])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and (item.item() == 2): # I
entity_ids[-1][-1] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
                if not entity_ids:  # there must be at least one entity span
                    entity_ids.append([0, 0])  # pad with a zero span if there is none
                batch_entity_ids.append([i for i in entity_ids if i])
            batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)  # [btz, num_entities, start/end]
btz, entity_count, _ = batch_entity_ids.shape
hidden_size = last_hidden_state.shape[-1]
gather_index = batch_entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
entity_states = torch.gather(last_hidden_state, dim=1, index=gather_index).reshape(btz, entity_count, -1, hidden_size)
            entity_states = torch.mean(entity_states, dim=2)  # average of the hidden states at the entity start and end positions
            entity_logit = self.dense2(entity_states)  # [btz, num_entities, num_entity_types]
            entity_pred = torch.argmax(entity_logit, dim=-1)  # [btz, num_entities]
            # each element is a (sample id, start, end, entity type) tuple
            entity_tuple = trans_entity2tuple(batch_entity_ids, entity_pred)
            return best_path, entity_tuple
model = Model().to(device)
class Loss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.loss2 = nn.CrossEntropyLoss(ignore_index=0)
def forward(self, outputs, labels):
emission_score, attention_mask, entity_logit = outputs
seq_labels, entity_labels = labels
loss1 = model.crf(emission_score, attention_mask, seq_labels)
loss2 = self.loss2(entity_logit.reshape(-1, entity_logit.shape[-1]), entity_labels.flatten())
return {'loss': loss1+loss2, 'loss1': loss1, 'loss2': loss2}
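# The reported 'loss' is simply the sum of the stage-one CRF loss and the stage-two span
# classification cross entropy.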
# The keys returned by Loss are automatically added to the metrics, so loss1 and loss2 are printed even if they are not listed in the metrics argument below
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X1, Y1, Z1 = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for (token_ids, entity_ids), (label, entity_labels) in tqdm(data):
scores, entity_pred = model.predict(token_ids) # [btz, seq_len]
        # stage-one metric: token level
attention_mask = label.gt(0)
X1 += (scores.eq(label) * attention_mask).sum().item()
Y1 += scores.gt(0).sum().item()
Z1 += label.gt(0).sum().item()
        # stage-two metric: entity level
entity_true = trans_entity2tuple(entity_ids, entity_labels)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X1 / (Y1 + Z1), X1 / Y1, X1 / Z1
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
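# Note on the metrics above (added for clarity, not from the original comments): X1 counts
# non-'O' positions whose predicted tag equals the gold tag, Y1 counts predicted non-'O'
# positions and Z1 counts gold non-'O' positions, so precision = X1/Y1, recall = X1/Z1 and
# f1 = 2*X1/(Y1+Z1), i.e. micro-F1 over labelled tokens; the stage-2 figures (X2/Y2/Z2) are
# the same micro-F1 computed over decoded entity tuples.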
def trans_entity2tuple(entity_ids, entity_labels):
'''Convert tensors into (sample_id, start, end, entity_type) tuples for metric computation
'''
entity_true = set()
for i, one_sample in enumerate(entity_ids):
for j, item in enumerate(one_sample):
if item[0].item() * item[1].item() != 0:
entity_true.add((i, item[0].item(), item[1].item(), entity_labels[i, j].item()))
return entity_true
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-stage1] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-stage2] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT + CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories)) # 包含首尾
self.crf = CRF(len(categories))
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token粒度
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity粒度
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors into (sample_id, start, end, entity_type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT + CRF for NER, with part-of-speech tags added as an extra embedding
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.30; entity_level: 96.09
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import jieba.posseg as psg
from collections import Counter
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
psg_map = {v: i+1 for i, v in enumerate(['a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 'e', 'f', 'g', 'h', 'i', 'j',
'k', 'l', 'm', 'mg', 'mq', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 'rg', 'rr', 'rz', 's', 't',
'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq', 'x', 'y', 'z', 'zg'])}
def collate_fn(batch):
batch_token_ids, batch_psg_ids, batch_labels = [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens) # character span of each token in the original text
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
# build the part-of-speech features
seg = [(i, p) for word, p in psg.cut(d[0]) for i in word]
seg_word, seg_p = zip(*seg)
psg_ids = np.zeros(len(token_ids))
for i, j in enumerate(mapping):
if j:
start, end = j[0], j[-1] # start/end of the token in the original text
token_new = (''.join(seg_word[start:end+1])).lower()
assert tokens[i] == token_new, f"{tokens[i]} -> {token_new}"
if start == end:
psg_ids[i] = psg_map.get(seg_p[start], 0) # 0 if the tag is not in psg_map
else:
psg_ids[i] = psg_map.get(Counter(seg_p[start:end+1]).most_common(1)[0][0], 0) # take the most common tag
batch_psg_ids.append(psg_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_psg_ids = torch.tensor(sequence_padding(batch_psg_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return [batch_token_ids, batch_psg_ids], batch_labels
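# Note (added for clarity, an interpretation of the code above): each token inherits the POS tag
# of the jieba word it belongs to; when a token spans several characters with different tags, the
# most common tag wins, and tags missing from psg_map fall back to id 0.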
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
layer_add_embs = nn.Embedding(len(psg_map)+1, 768)
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0,
layer_add_embs=layer_add_embs)
self.fc = nn.Linear(768, len(categories))
self.crf = CRF(len(categories))
def forward(self, token_ids, psg_ids):
sequence_output = self.bert([token_ids, psg_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [bts, seq_len, tag_size]
attention_mask = token_ids.gt(0)
return emission_score, attention_mask
def predict(self, token_ids, psg_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids, psg_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [bts, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for (token_ids, psg_ids), label in tqdm(data):
scores = model.predict(token_ids, psg_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token粒度
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity粒度
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors into (sample_id, start, end, entity_type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# BERT + CRF for NER, comparing two ways of using dataset-derived CRF transition weights:
# using them only to initialize trainable transitions vs. freezing them
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# initialized (trainable): [valid_f1] token_level: 97.35; entity_level: 96.42
# frozen: [valid_f1] token_level: 96.92; entity_level: 95.42
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
batch_token_ids.append(token_ids)
batch_labels.append(labels)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# 转换数据集
train_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')
valid_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev')
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)
# derive initial CRF transition weights from the training data
transition = np.zeros((len(categories), len(categories)))
start_transition = np.zeros(len(categories))
end_transition = np.zeros(len(categories))
for d in tqdm(train_data, desc='Generate init_transitions'):
tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros(len(token_ids))
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
labels[start] = categories_label2id['B-'+label]
labels[start + 1:end + 1] = categories_label2id['I-'+label]
for i in range(len(labels)-1):
transition[int(labels[i]), int(labels[i+1])] += 1
start_transition[int(labels[0])] += 1 # transition from <start> to the first tag
end_transition[int(labels[-1])] += 1 # transition from the last tag to <end>
max_v = np.max([np.max(transition), np.max(start_transition), np.max(end_transition)])
min_v = np.min([np.min(transition), np.min(start_transition), np.min(end_transition)])
transition = (transition - min_v) / (max_v - min_v)
start_transition = (start_transition - min_v) / (max_v - min_v)
end_transition = (end_transition - min_v) / (max_v - min_v)
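# Note (added for clarity; an interpretation, not from the original comments): the three count
# matrices above are min-max normalized with a shared minimum and maximum, so the rarest count
# maps to 0 and the most frequent transition to 1, giving the CRF a dataset-driven prior over
# tag transitions (transitions never seen in the training data end up near 0).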
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(categories))
self.crf = CRF(len(categories), init_transitions=[transition, start_transition, end_transition], freeze=True) # controls whether the transitions are initialized from the data and whether they are trained (freeze=True keeps them fixed)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
emission_score = self.fc(sequence_output) # [btz, seq_len, tag_size]
attention_mask = token_ids.gt(0).long()
return emission_score, attention_mask
def predict(self, token_ids):
self.eval()
with torch.no_grad():
emission_score, attention_mask = self.forward(token_ids)
best_path = self.crf.decode(emission_score, attention_mask) # [btz, seq_len]
return best_path
model = Model().to(device)
class Loss(nn.Module):
def forward(self, outputs, labels):
return model.crf(*outputs, labels)
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 1e-10, 1e-10, 1e-10
X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
for token_ids, label in tqdm(data):
scores = model.predict(token_ids) # [btz, seq_len]
attention_mask = label.gt(0)
# token粒度
X += (scores.eq(label) * attention_mask).sum().item()
Y += scores.gt(0).sum().item()
Z += label.gt(0).sum().item()
# entity粒度
entity_pred = trans_entity2tuple(scores)
entity_true = trans_entity2tuple(label)
X2 += len(entity_pred.intersection(entity_true))
Y2 += len(entity_pred)
Z2 += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2/ Y2, X2 / Z2
return f1, precision, recall, f2, precision2, recall2
def trans_entity2tuple(scores):
'''Convert tensors into (sample_id, start, end, entity_type) tuples for metric computation
'''
batch_entity_ids = set()
for i, one_samp in enumerate(scores):
entity_ids = []
for j, item in enumerate(one_samp):
flag_tag = categories_id2label[item.item()]
if flag_tag.startswith('B-'): # B
entity_ids.append([i, j, j, flag_tag[2:]])
elif len(entity_ids) == 0:
continue
elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:]==entity_ids[-1][-1]): # I
entity_ids[-1][-2] = j
elif len(entity_ids[-1]) > 0:
entity_ids.append([])
for i in entity_ids:
if i:
batch_entity_ids.add(tuple(i))
return batch_entity_ids
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
if f2 > self.best_val_f1:
self.best_val_f1 = f2
# model.save_weights('best_model.pt')
print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# EfficientGlobalPointer for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Blog: https://kexue.fm/archives/8373
# [valid_f1]: 96.55
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import EfficientGlobalPointer
maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
data.append((text, label)) # label is [[start, end, entity_type], ...]
return data
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for i, (text, text_labels) in enumerate(batch):
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros((len(categories_label2id), maxlen, maxlen))
for start, end, label in text_labels:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label = categories_label2id[label]
labels[label, start, end] = 1
batch_token_ids.append(token_ids) # length already limited by maxlen above
batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.global_pointer = EfficientGlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
return logit
model = Model().to(device)
class MyLoss(MultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_pred, y_true):
y_true = y_true.view(y_true.shape[0]*y_true.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
y_pred = y_pred.view(y_pred.shape[0]*y_pred.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
return super().forward(y_pred, y_true)
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
X, Y, Z = 1e-10, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true)
for i, score in enumerate(scores):
R = set()
for l, start, end in zip(*np.where(score.cpu() > threshold)):
R.add((start, end, categories_id2label[l]))
T = set()
for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
T.add((start, end, categories_id2label[l]))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# GlobalPointer for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Blog: https://kexue.fm/archives/8373
# [valid_f1]: 95.66
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import GlobalPointer
import random
import os
maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
data.append((text, label)) # label is [[start, end, entity_type], ...]
return data
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for i, (text, text_labels) in enumerate(batch):
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
labels = np.zeros((len(categories_label2id), maxlen, maxlen))
for start, end, label in text_labels:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label = categories_label2id[label]
labels[label, start, end] = 1
batch_token_ids.append(token_ids) # length already limited by maxlen above
batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
return batch_token_ids, batch_labels
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.global_pointer = GlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)
def forward(self, token_ids):
sequence_output = self.bert([token_ids]) # [btz, seq_len, hdsz]
logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
return logit
model = Model().to(device)
class MyLoss(MultilabelCategoricalCrossentropy):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, y_pred, y_true):
y_true = y_true.view(y_true.shape[0]*y_true.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
y_pred = y_pred.view(y_pred.shape[0]*y_pred.shape[1], -1) # [btz*ner_vocab_size, seq_len*seq_len]
return super().forward(y_pred, y_true)
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
X, Y, Z = 0, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true)
for i, score in enumerate(scores):
R = set()
for l, start, end in zip(*np.where(score.cpu() > threshold)):
R.add((start, end, categories_id2label[l]))
T = set()
for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
T.add((start, end, categories_id2label[l]))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# MRC (machine reading comprehension) style NER
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]: 95.75
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from collections import defaultdict
max_c_len = 224
max_q_len = 32
batch_size = 6 # the effective batch size is batch_size * number of entity types
categories = ['LOC', 'PER', 'ORG']
ent2query = {"LOC": "找出下述句子中的地址名",
"PER": "找出下述句子中的人名",
"ORG": "找出下述句子中的机构名"}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_start_labels, batch_end_labels = [], [], [], []
batch_ent_type = []
for d in batch:
tokens_b = tokenizer.tokenize(d[0], maxlen=max_c_len)[1:] # drop the [CLS] token
mapping = tokenizer.rematch(d[0], tokens_b)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
# group gold entities by type
label_dict = defaultdict(list)
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
label_dict[label].append((start, end))
# iterate over entity types: the query is tokens_a and the context is tokens_b
# each sample is laid out as [CLS] + tokens_a + [SEP] + tokens_b + [SEP]
for _type in categories:
start_ids = [0] * len(tokens_b)
end_ids = [0] * len(tokens_b)
text_a = ent2query[_type]
tokens_a = tokenizer.tokenize(text_a, maxlen=max_q_len)
for _label in label_dict[_type]:
start_ids[_label[0]] = 1
end_ids[_label[1]] = 1
start_ids = [0] * len(tokens_a) + start_ids
end_ids = [0] * len(tokens_a) + end_ids
token_ids = tokenizer.tokens_to_ids(tokens_a) + tokenizer.tokens_to_ids(tokens_b)
segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)
assert len(start_ids) == len(end_ids) == len(token_ids) == len(segment_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_start_labels.append(start_ids)
batch_end_labels.append(end_ids)
batch_ent_type.append(_type)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_start_labels = torch.tensor(sequence_padding(batch_start_labels), dtype=torch.long, device=device)
batch_end_labels = torch.tensor(sequence_padding(batch_end_labels), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_segment_ids, batch_start_labels, batch_end_labels, batch_ent_type]
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path)
self.mid_linear = nn.Sequential(
nn.Linear(768, 128),
nn.ReLU(),
nn.Dropout(0.1)
)
self.start_fc = nn.Linear(128, 2)
self.end_fc = nn.Linear(128, 2)
def forward(self, token_ids, segment_ids):
sequence_output = self.bert([token_ids, segment_ids]) # [bts, seq_len, hdsz]
seq_out = self.mid_linear(sequence_output) # [bts, seq_len, mid_dims]
start_logits = self.start_fc(seq_out) # [bts, seq_len, 2]
end_logits = self.end_fc(seq_out) # [bts, seq_len, 2]
return start_logits, end_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, outputs, labels):
start_logits, end_logits = outputs
mask, start_ids, end_ids = labels[:3]
start_logits = start_logits.view(-1, 2)
end_logits = end_logits.view(-1, 2)
# drop the labels on text_a and padding, so the loss only covers the context
active_loss = mask.view(-1) == 1
active_start_logits = start_logits[active_loss]
active_end_logits = end_logits[active_loss]
active_start_labels = start_ids.view(-1)[active_loss]
active_end_labels = end_ids.view(-1)[active_loss]
start_loss = super().forward(active_start_logits, active_start_labels)
end_loss = super().forward(active_end_logits, active_end_labels)
return start_loss + end_loss
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 0, 1e-10, 1e-10
for (token_ids, segment_ids), labels in tqdm(data, desc='Evaluation'):
start_logit, end_logit = model.predict([token_ids, segment_ids]) # [btz, seq_len, 2]
mask, start_ids, end_ids, ent_type = labels
# entity粒度
entity_pred = mrc_decode(start_logit, end_logit, ent_type, mask)
entity_true = mrc_decode(start_ids, end_ids, ent_type)
X += len(entity_pred.intersection(entity_true))
Y += len(entity_pred)
Z += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X/ Y, X / Z
return f1, precision, recall
# strict decoding baseline
def mrc_decode(start_preds, end_preds, ent_type, mask=None):
'''Return the (sample_id, start, end, entity_type) spans decoded from start/end predictions
'''
predict_entities = set()
if mask is not None: # for predictions, mask out the query and padding positions
start_preds = torch.argmax(start_preds, -1) * mask
end_preds = torch.argmax(end_preds, -1) * mask
start_preds = start_preds.cpu().numpy()
end_preds = end_preds.cpu().numpy()
for bt_i in range(start_preds.shape[0]):
start_pred = start_preds[bt_i]
end_pred = end_preds[bt_i]
# collect the spans of each sample
for i, s_type in enumerate(start_pred):
if s_type == 0:
continue
for j, e_type in enumerate(end_pred[i:]):
if s_type == e_type:
# (sample_id, entity start, entity end, entity type)
predict_entities.add((bt_i, i, i+j, ent_type[bt_i]))
break
return predict_entities
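# --- Toy example (added for clarity, not part of the original script) ---
# With gold-style 0/1 start/end ids and no mask, mrc_decode pairs every start position with the
# nearest end position carrying the same label; the hand-made tensors below are purely illustrative.
_demo_start_ids = torch.tensor([[0, 1, 0, 0, 1, 0]])
_demo_end_ids = torch.tensor([[0, 0, 1, 0, 0, 1]])
assert mrc_decode(_demo_start_ids, _demo_end_ids, ['PER']) == {(0, 1, 2, 'PER'), (0, 4, 5, 'PER')}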
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Span-pointer (start/end position classification) NER
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]: 96.31
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.losses import FocalLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
max_len = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']
categories_id2label = {i: k for i, k in enumerate(categories, start=1)}
categories_label2id = {k: i for i, k in enumerate(categories, start=1)}
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d = ['']
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
d[0] += char
if flag[0] == 'B':
d.append([i, i, flag[2:]])
elif flag[0] == 'I':
d[-1][1] = i
D.append(d)
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_start_labels, batch_end_labels = [], [], []
for d in batch:
tokens = tokenizer.tokenize(d[0], maxlen=max_len)[1:] # drop the [CLS] token
mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
token_ids = tokenizer.tokens_to_ids(tokens)
start_ids = [0] * len(tokens)
end_ids = [0] * len(tokens)
for start, end, label in d[1:]:
if start in start_mapping and end in end_mapping:
start = start_mapping[start]
end = end_mapping[end]
start_ids[start] = categories_label2id[label]
end_ids[end] = categories_label2id[label]
batch_token_ids.append(token_ids)
batch_start_labels.append(start_ids)
batch_end_labels.append(end_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_start_labels = torch.tensor(sequence_padding(batch_start_labels), dtype=torch.long, device=device)
batch_end_labels = torch.tensor(sequence_padding(batch_end_labels), dtype=torch.long, device=device)
batch_mask = batch_token_ids.gt(0).long()
return [batch_token_ids], [batch_mask, batch_start_labels, batch_end_labels]
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.mid_linear = nn.Sequential(
nn.Linear(768, 128),
nn.ReLU(),
nn.Dropout(0.1)
)
self.start_fc = nn.Linear(128, len(categories)+1) # 0 means "no entity"
self.end_fc = nn.Linear(128, len(categories)+1)
def forward(self, token_ids):
sequence_output = self.bert(token_ids) # [bts, seq_len, hdsz]
seq_out = self.mid_linear(sequence_output) # [bts, seq_len, mid_dims]
start_logits = self.start_fc(seq_out) # [bts, seq_len, num_tags]
end_logits = self.end_fc(seq_out) # [bts, seq_len, num_tags]
return start_logits, end_logits
model = Model().to(device)
class Loss(nn.CrossEntropyLoss):
def forward(self, outputs, labels):
start_logits, end_logits = outputs
mask, start_ids, end_ids = labels
start_logits = start_logits.view(-1, len(categories)+1)
end_logits = end_logits.view(-1, len(categories)+1)
# drop the labels on padding positions, so the loss only covers real tokens
active_loss = mask.view(-1) == 1
active_start_logits = start_logits[active_loss]
active_end_logits = end_logits[active_loss]
active_start_labels = start_ids.view(-1)[active_loss]
active_end_labels = end_ids.view(-1)[active_loss]
start_loss = super().forward(active_start_logits, active_start_labels)
end_loss = super().forward(active_end_logits, active_end_labels)
return start_loss + end_loss
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data):
X, Y, Z = 0, 1e-10, 1e-10
for token_ids, labels in tqdm(data, desc='Evaluation'):
start_logit, end_logit = model.predict(token_ids) # [btz, seq_len, 2]
mask, start_ids, end_ids = labels
# entity粒度
entity_pred = span_decode(start_logit, end_logit, mask)
entity_true = span_decode(start_ids, end_ids)
X += len(entity_pred.intersection(entity_true))
Y += len(entity_pred)
Z += len(entity_true)
f1, precision, recall = 2 * X / (Y + Z), X/ Y, X / Z
return f1, precision, recall
# strict decoding baseline
def span_decode(start_preds, end_preds, mask=None):
'''Return the (sample_id, start, end, entity_type) spans decoded from start/end predictions
'''
predict_entities = set()
if mask is not None: # mask out the padding positions of predictions
start_preds = torch.argmax(start_preds, -1) * mask
end_preds = torch.argmax(end_preds, -1) * mask
start_preds = start_preds.cpu().numpy()
end_preds = end_preds.cpu().numpy()
for bt_i in range(start_preds.shape[0]):
start_pred = start_preds[bt_i]
end_pred = end_preds[bt_i]
# collect the spans of each sample
for i, s_type in enumerate(start_pred):
if s_type == 0:
continue
for j, e_type in enumerate(end_pred[i:]):
if s_type == e_type:
# (sample_id, entity start, entity end, entity type)
predict_entities.add((bt_i, i, i+j, categories_id2label[s_type]))
break
return predict_entities
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# tplinker_plus for named entity recognition
# [valid_f1]: 95.71
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import TplinkerHandshakingKernel
maxlen = 64
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key,value in categories_label2id.items())
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
data = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
text, label = '', []
for i, c in enumerate(l.split('\n')):
char, flag = c.split(' ')
text += char
if flag[0] == 'B':
label.append([i, i, flag[2:]])
elif flag[0] == 'I':
label[-1][1] = i
text_list = tokenizer.tokenize(text)[1:-1] # drop the leading [CLS] and trailing [SEP]
tokens = [j for i in text_list for j in i][:maxlen] # flatten to single characters
data.append((tokens, label)) # label is [[start, end, entity_type], ...]
return data
def trans_ij2k(seq_len, i, j):
'''Map row i, column j to its index in the flattened upper triangle
'''
if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
return 0
return int(0.5*(2*seq_len-i+1)*i+(j-i))
map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}
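# Quick sanity checks (added for clarity, not part of the original script): for seq_len=4 the
# upper triangle is flattened row by row, so (0,0)->0, (0,3)->3, (1,1)->4 and (3,3)->9; in
# general the mapping enumerates all maxlen*(maxlen+1)//2 pairs (i, j) with j >= i.
assert trans_ij2k(4, 0, 3) == 3 and trans_ij2k(4, 1, 1) == 4 and trans_ij2k(4, 3, 3) == 9
assert len(map_ij2k) == maxlen * (maxlen + 1) // 2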
def tran_ent_rel2id():
'''Build the tag-to-id mapping used by the final classification layer
'''
tag2id = {}
for p in categories_label2id.keys():
tag2id[p] = len(tag2id)
return tag2id
tag2id = tran_ent_rel2id()
id2tag = {v: k for k, v in tag2id.items()}
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
pair_len = maxlen * (maxlen+1)//2
# batch_head_labels: [btz, pair_len, tag2id_len]
batch_labels = torch.zeros((len(batch), pair_len, len(tag2id)), dtype=torch.long, device=device)
batch_token_ids = []
for i, (tokens, labels) in enumerate(batch):
batch_token_ids.append(tokenizer.tokens_to_ids(tokens)) # length already truncated above
for s_i in labels:
if s_i[1] >= len(tokens): # skip entities whose end lies beyond the truncated text
continue
batch_labels[i, map_ij2k[s_i[0], s_i[1]], tag2id[s_i[2]]] = 1
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
return [batch_token_ids], batch_labels
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
self.fc = nn.Linear(768, len(tag2id))
self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cln_plus', inner_enc_type='lstm')
def forward(self, inputs):
last_hidden_state = self.bert(inputs) # [btz, seq_len, hdsz]
shaking_hiddens = self.handshaking_kernel(last_hidden_state)
output = self.fc(shaking_hiddens) # [btz, pair_len, tag_size]
return output
model = Model().to(device)
model.compile(loss=MultilabelCategoricalCrossentropy(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
X, Y, Z = 0, 1e-10, 1e-10
for x_true, label in data:
scores = model.predict(x_true) # [btz, pair_len, tag_size]
for i, score in enumerate(scores):
R = set()
for pair_id, tag_id in zip(*np.where(score.cpu().numpy() > threshold)):
start, end = map_k2ij[pair_id][0], map_k2ij[pair_id][1]
R.add((start, end, tag_id))
T = set()
for pair_id, tag_id in zip(*np.where(label[i].cpu().numpy() > threshold)):
start, end = map_k2ij[pair_id][0], map_k2ij[pair_id][1]
T.add((start, end, tag_id))
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
import argparse
import collections
import json
import os
import pickle
import torch
import logging
import shutil
from tqdm import tqdm
import time
logger = logging.Logger('log')
def get_path_from_url(url, root_dir, check_exist=True, decompress=True):
""" Download from given url to root_dir.
If the file or directory specified by url already exists under
root_dir, return its path directly; otherwise download it
from url, decompress it, and return the local path.
Args:
url (str): download url
root_dir (str): root dir for downloading, it should be
WEIGHTS_HOME or DATASET_HOME
decompress (bool): decompress zip or tar file. Default is `True`
Returns:
str: a local path to save downloaded models & weights & datasets.
"""
import os.path
import os
import tarfile
import zipfile
def is_url(path):
"""
Whether the given path is a URL.
Args:
path (str): the path to check.
"""
return path.startswith('http://') or path.startswith('https://')
def _map_path(url, root_dir):
# parse path after download under root_dir
fname = os.path.split(url)[-1]
fpath = fname
return os.path.join(root_dir, fpath)
def _get_download(url, fullname):
import requests
# using requests.get method
fname = os.path.basename(fullname)
try:
req = requests.get(url, stream=True)
except Exception as e: # requests.exceptions.ConnectionError
logger.info("Downloading {} from {} failed with exception {}".format(
fname, url, str(e)))
return False
if req.status_code != 200:
raise RuntimeError("Downloading from {} failed with code "
"{}!".format(url, req.status_code))
# To protect against interrupted downloads, download to
# tmp_fullname first, then move tmp_fullname to fullname
# once the download has finished
tmp_fullname = fullname + "_tmp"
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
with tqdm(total=(int(total_size) + 1023) // 1024, unit='KB') as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
pbar.update(1)
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
shutil.move(tmp_fullname, fullname)
return fullname
def _download(url, path):
"""
Download from url, save to path.
url (str): download url
path (str): download to given path
"""
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.split(url)[-1]
fullname = os.path.join(path, fname)
retry_cnt = 0
logger.info("Downloading {} from {}".format(fname, url))
DOWNLOAD_RETRY_LIMIT = 3
while not os.path.exists(fullname):
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
retry_cnt += 1
else:
raise RuntimeError("Download from {} failed. "
"Retry limit reached".format(url))
if not _get_download(url, fullname):
time.sleep(1)
continue
return fullname
def _uncompress_file_zip(filepath):
with zipfile.ZipFile(filepath, 'r') as files:
file_list = files.namelist()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
# `strip(os.sep)` to remove `os.sep` in the tail of path
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _is_a_single_file(file_list):
if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
return True
return False
def _is_a_single_dir(file_list):
new_file_list = []
for file_path in file_list:
if '/' in file_path:
file_path = file_path.replace('/', os.sep)
elif '\\' in file_path:
file_path = file_path.replace('\\', os.sep)
new_file_list.append(file_path)
file_name = new_file_list[0].split(os.sep)[0]
for i in range(1, len(new_file_list)):
if file_name != new_file_list[i].split(os.sep)[0]:
return False
return True
def _uncompress_file_tar(filepath, mode="r:*"):
with tarfile.open(filepath, mode) as files:
file_list = files.getnames()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _decompress(fname):
"""
Decompress for zip and tar file
"""
logger.info("Decompressing {}...".format(fname))
# To protect against interrupted decompression,
# decompress to an fpath_tmp directory first; if decompression
# succeeds, move the decompressed files to fpath, then delete
# fpath_tmp and remove the downloaded compressed file.
if tarfile.is_tarfile(fname):
uncompressed_path = _uncompress_file_tar(fname)
elif zipfile.is_zipfile(fname):
uncompressed_path = _uncompress_file_zip(fname)
else:
raise TypeError("Unsupport compress file type {}".format(fname))
return uncompressed_path
assert is_url(url), "downloading from {} not a url".format(url)
fullpath = _map_path(url, root_dir)
if os.path.exists(fullpath) and check_exist:
logger.info("Found {}".format(fullpath))
else:
fullpath = _download(url, root_dir)
if decompress and (tarfile.is_tarfile(fullpath) or
zipfile.is_zipfile(fullpath)):
fullpath = _decompress(fullpath)
return fullpath
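# Hedged usage sketch (added for clarity; the target directory below is hypothetical):
# local_path = get_path_from_url(MODEL_MAP['uie-base']['resource_file_urls']['vocab_file'], './uie-base')
# would download vocab.txt into ./uie-base/ (or return the existing file) using the URLs in MODEL_MAP below.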
MODEL_MAP = {
"uie-base": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json"
}
},
"uie-medium": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-mini": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-micro": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-nano": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano_v1.0/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-medical-base": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medical_base_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
}
},
"uie-tiny": {
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny_v0.1/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_config.json",
"vocab_file":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/vocab.txt",
"special_tokens_map":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/special_tokens_map.json",
"tokenizer_config":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/tokenizer_config.json"
}
}
}
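# Note (added for clarity; not in the original script): except for uie-tiny, every variant above
# reuses the uie_base vocab.txt / special_tokens_map.json / tokenizer_config.json and differs
# only in model_state.pdparams (plus, for the sized variants, model_config.json).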

def build_params_map(attention_num=12):
    """
    Build the parameter-name map from PaddlePaddle's ERNIE to transformer-style BERT.
    :return: OrderedDict mapping paddle weight names to bert4torch/BERT weight names
    """
    weight_map = collections.OrderedDict({
        'encoder.embeddings.word_embeddings.weight': "bert.embeddings.word_embeddings.weight",
        'encoder.embeddings.position_embeddings.weight': "bert.embeddings.position_embeddings.weight",
        'encoder.embeddings.token_type_embeddings.weight': "bert.embeddings.token_type_embeddings.weight",
        'encoder.embeddings.task_type_embeddings.weight': "embeddings.task_type_embeddings.weight",  # no 'bert' prefix here: mapped directly onto the bert4torch structure
        'encoder.embeddings.layer_norm.weight': 'bert.embeddings.LayerNorm.weight',
        'encoder.embeddings.layer_norm.bias': 'bert.embeddings.LayerNorm.bias',
    })
    # add attention layers
    for i in range(attention_num):
        weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.query.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.query.bias'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.key.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.key.bias'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.value.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.value.bias'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.weight'] = f'bert.encoder.layer.{i}.attention.output.dense.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.bias'] = f'bert.encoder.layer.{i}.attention.output.dense.bias'
        weight_map[f'encoder.encoder.layers.{i}.norm1.weight'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.weight'
        weight_map[f'encoder.encoder.layers.{i}.norm1.bias'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.bias'
        weight_map[f'encoder.encoder.layers.{i}.linear1.weight'] = f'bert.encoder.layer.{i}.intermediate.dense.weight'
        weight_map[f'encoder.encoder.layers.{i}.linear1.bias'] = f'bert.encoder.layer.{i}.intermediate.dense.bias'
        weight_map[f'encoder.encoder.layers.{i}.linear2.weight'] = f'bert.encoder.layer.{i}.output.dense.weight'
        weight_map[f'encoder.encoder.layers.{i}.linear2.bias'] = f'bert.encoder.layer.{i}.output.dense.bias'
        weight_map[f'encoder.encoder.layers.{i}.norm2.weight'] = f'bert.encoder.layer.{i}.output.LayerNorm.weight'
        weight_map[f'encoder.encoder.layers.{i}.norm2.bias'] = f'bert.encoder.layer.{i}.output.LayerNorm.bias'
    # add the pooler and the UIE start/end pointer heads
    weight_map.update(
        {
            'encoder.pooler.dense.weight': 'bert.pooler.dense.weight',
            'encoder.pooler.dense.bias': 'bert.pooler.dense.bias',
            'linear_start.weight': 'linear_start.weight',
            'linear_start.bias': 'linear_start.bias',
            'linear_end.weight': 'linear_end.weight',
            'linear_end.bias': 'linear_end.bias',
        }
    )
    return weight_map
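# Illustrative only (not part of the original script): with the defaults above the map contains,
# per layer, entries such as
#   'encoder.encoder.layers.0.self_attn.q_proj.weight' -> 'bert.encoder.layer.0.attention.self.query.weight'
#   'encoder.encoder.layers.0.linear1.weight'          -> 'bert.encoder.layer.0.intermediate.dense.weight'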

def extract_and_convert(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info('=' * 20 + 'save config file' + '=' * 20)
    config = json.load(open(os.path.join(input_dir, 'model_config.json'), 'rt', encoding='utf-8'))
    config = config['init_args'][0]
    config["architectures"] = ["UIE"]
    config['layer_norm_eps'] = 1e-12
    del config['init_class']
    if 'sent_type_vocab_size' in config:
        config['type_vocab_size'] = config['sent_type_vocab_size']
    config['intermediate_size'] = 4 * config['hidden_size']
    json.dump(config, open(os.path.join(output_dir, 'config.json'), 'wt', encoding='utf-8'), indent=4)
    logger.info('=' * 20 + 'save vocab file' + '=' * 20)
    with open(os.path.join(input_dir, 'vocab.txt'), 'rt', encoding='utf-8') as f:
        words = f.read().splitlines()
    # Duplicated tokens would make the token->id lookup ambiguous, so every duplicate
    # (keeping the last occurrence) is replaced below by an unused placeholder character.
    words_set = set()
    words_duplicate_indices = []
    for i in range(len(words) - 1, -1, -1):
        word = words[i]
        if word in words_set:
            words_duplicate_indices.append(i)
        words_set.add(word)
    for i, idx in enumerate(words_duplicate_indices):
        words[idx] = chr(0x1F6A9 + i)  # Change duplicated word to 🚩 LOL
    with open(os.path.join(output_dir, 'vocab.txt'), 'wt', encoding='utf-8') as f:
        for word in words:
            f.write(word + '\n')
    special_tokens_map = {
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]"
    }
    json.dump(special_tokens_map, open(os.path.join(output_dir, 'special_tokens_map.json'),
                                       'wt', encoding='utf-8'))
    tokenizer_config = {
        "do_lower_case": True,
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
        "tokenizer_class": "BertTokenizer"
    }
    json.dump(tokenizer_config, open(os.path.join(output_dir, 'tokenizer_config.json'),
                                     'wt', encoding='utf-8'))
    logger.info('=' * 20 + 'extract weights' + '=' * 20)
    state_dict = collections.OrderedDict()
    weight_map = build_params_map(attention_num=config['num_hidden_layers'])
    paddle_paddle_params = pickle.load(
        open(os.path.join(input_dir, 'model_state.pdparams'), 'rb'))
    del paddle_paddle_params['StructuredToParameterName@@']
    for weight_name, weight_value in paddle_paddle_params.items():
        if 'weight' in weight_name:
            if 'encoder.encoder' in weight_name or 'pooler' in weight_name or 'linear' in weight_name:
                # paddle nn.Linear stores weights as [in_features, out_features]; torch expects [out_features, in_features]
                weight_value = weight_value.transpose()
        # Fix: embedding error (zero out the embedding row of token id 0)
        if 'word_embeddings.weight' in weight_name:
            weight_value[0, :] = 0
        if weight_name not in weight_map:
            logger.info(f"{'='*20} [SKIP] {weight_name} {'='*20}")
            continue
        state_dict[weight_map[weight_name]] = torch.FloatTensor(weight_value)
        logger.info(f"{weight_name} -> {weight_map[weight_name]} {weight_value.shape}")
    torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))
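# Sanity-check sketch (added; assumes the conversion above has run): the saved checkpoint can be
# reloaded and inspected with plain PyTorch, e.g.
#   state = torch.load(os.path.join(output_dir, 'pytorch_model.bin'), map_location='cpu')
#   print(len(state), list(state)[:3])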

def check_model(input_model):
    if not os.path.exists(input_model):
        if input_model not in MODEL_MAP:
            raise ValueError('input_model does not exist and is not a known model name!')
        resource_file_urls = MODEL_MAP[input_model]['resource_file_urls']
        logger.info("Downloading resource files...")
        for key, val in resource_file_urls.items():
            file_path = os.path.join(input_model, key)
            if not os.path.exists(file_path):
                get_path_from_url(val, input_model)


def do_main():
    # args is populated by the argparse block in __main__ below
    check_model(args.input_model)
    extract_and_convert(args.input_model, args.output_model)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_model", default="uie-base", type=str,
                        help="Directory of the input paddle model; known names such as uie-base/uie-tiny are downloaded automatically")
    parser.add_argument("-o", "--output_model", default="uie_base_pytorch", type=str,
                        help="Directory of the output pytorch model")
    args = parser.parse_args()
    do_main()
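
# Example invocation (added for reference; assumes this file is saved as convert_uie.py):
#   python convert_uie.py -i uie-base -o uie_base_pytorch
# This downloads the PaddleNLP uie-base resources if they are not already present and writes
# config.json, vocab.txt, special_tokens_map.json, tokenizer_config.json and pytorch_model.bin
# into uie_base_pytorch/.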