Commit 0fc002df authored by huchen

init the dlexamples new

parent 0e04b692
#! -*- coding:utf-8 -*-
# Model compression on the text classification example
# Method: BERT-of-Theseus
# Paper: https://arxiv.org/abs/2002.02925
# Blog: https://kexue.fm/archives/7575
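# A rough sketch of the idea (my summary; see the paper for details): the 12-layer
# predecessor is split into modules of several consecutive layers, each paired with one
# layer of the smaller successor. During training each module's output is replaced by its
# successor layer's output with probability 0.5 (a Bernoulli gate, see BinaryRandomChoice
# below); at inference only the successor path remains, giving the compressed 3-layer model.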
import json
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Input, Lambda, Dense, Layer
from keras.models import Model
num_classes = 119
maxlen = 128
batch_size = 32
# BERT base
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:(文本, 标签id)
"""
D = []
with open(filename) as f:
for i, l in enumerate(f):
l = json.loads(l)
text, label = l['sentence'], l['label']
D.append((text, int(label)))
return D
# Load the datasets
train_data = load_data(
'/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json'
)
valid_data = load_data(
'/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json'
)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, (text, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# Convert the datasets to generators
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
class BinaryRandomChoice(Layer):
"""随机二选一
"""
def __init__(self, **kwargs):
super(BinaryRandomChoice, self).__init__(**kwargs)
self.supports_masking = True
def compute_mask(self, inputs, mask=None):
if mask is not None:
return mask[1]
def call(self, inputs):
source, target = inputs
mask = K.random_binomial(shape=[1], p=0.5)
output = mask * source + (1 - mask) * target
return K.in_train_phase(output, target)
def compute_output_shape(self, input_shape):
return input_shape[1]
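# Note: in the training phase the layer returns source or target with equal probability
# (a single Bernoulli draw per call, broadcast over the batch); outside training,
# K.in_train_phase makes it always return target, so only the successor branch is used.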
def bert_of_theseus(predecessor, successor, classfier):
"""bert of theseus
"""
inputs = predecessor.inputs
# Freeze the layers that have already been trained
for layer in predecessor.model.layers:
layer.trainable = False
classfier.trainable = False
# Replace the embedding layer
predecessor_outputs = predecessor.apply_embeddings(inputs)
successor_outputs = successor.apply_embeddings(inputs)
outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs])
# Replace the transformer layers
layers_per_module = predecessor.num_hidden_layers // successor.num_hidden_layers
for index in range(successor.num_hidden_layers):
predecessor_outputs = outputs
for sub_index in range(layers_per_module):
predecessor_outputs = predecessor.apply_main_layers(
predecessor_outputs, layers_per_module * index + sub_index
)
successor_outputs = successor.apply_main_layers(outputs, index)
outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs])
# Build and return the model
outputs = classfier(outputs)
model = Model(inputs, outputs)
return model
def evaluate(data, model):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
y_true = y_true[:, 0]
total += len(y_true)
right += (y_true == y_pred).sum()
return right / total
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self, savename):
self.best_val_acc = 0.
self.savename = savename
def on_epoch_end(self, epoch, logs=None):
val_acc = evaluate(valid_generator, self.model)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
self.model.save_weights(self.savename)
print(
u'val_acc: %.5f, best_val_acc: %.5f\n' %
(val_acc, self.best_val_acc)
)
# Load the pre-trained model (12 layers)
predecessor = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
return_keras_model=False,
prefix='Predecessor-'
)
# Load the pre-trained model (3 layers)
successor = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
return_keras_model=False,
num_hidden_layers=3,
prefix='Successor-'
)
# Classifier model
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classfier = Model(x_in, x)
predecessor_model = Model(predecessor.inputs, classfier(predecessor.output))
predecessor_model.compile(
loss='sparse_categorical_crossentropy',
optimizer=Adam(2e-5), # use a sufficiently small learning rate
metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()
successor_model = Model(successor.inputs, classfier(successor.output))
successor_model.compile(
loss='sparse_categorical_crossentropy',
optimizer=Adam(2e-5), # use a sufficiently small learning rate
metrics=['sparse_categorical_accuracy'],
)
successor_model.summary()
theseus_model = bert_of_theseus(predecessor, successor, classfier)
theseus_model.compile(
loss='sparse_categorical_crossentropy',
optimizer=Adam(2e-5), # use a sufficiently small learning rate
metrics=['sparse_categorical_accuracy'],
)
theseus_model.summary()
if __name__ == '__main__':
# Train the predecessor
predecessor_evaluator = Evaluator('best_predecessor.weights')
predecessor_model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=5,
callbacks=[predecessor_evaluator]
)
# Train the theseus model
theseus_evaluator = Evaluator('best_theseus.weights')
theseus_model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=10,
callbacks=[theseus_evaluator]
)
theseus_model.load_weights('best_theseus.weights')
# Train the successor
successor_evaluator = Evaluator('best_successor.weights')
successor_model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=5,
callbacks=[successor_evaluator]
)
#! -*- coding:utf-8 -*-
# Improve the model's generalization with a gradient penalty
# About 2% higher than the published BERT base result on the same dataset on the CLUE leaderboard
# Dataset: IFLYTEK long-text classification (https://github.com/CLUEbenchmark/CLUE)
# Blog: https://kexue.fm/archives/7234
# Works with Keras 2.3.1
import json
import numpy as np
from bert4keras.backend import keras, search_layer, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from keras.layers import Lambda, Dense
from tqdm import tqdm
num_classes = 119
maxlen = 128
batch_size = 32
# BERT base
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:(文本, 标签id)
"""
D = []
with open(filename) as f:
for i, l in enumerate(f):
l = json.loads(l)
text, label = l['sentence'], l['label']
D.append((text, int(label)))
return D
# Load the datasets
train_data = load_data(
'/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json'
)
valid_data = load_data(
'/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json'
)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, (text, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# Convert the datasets to generators
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
# Load the pre-trained model
bert = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0])(bert.model.output)
output = Dense(
units=num_classes,
activation='softmax',
kernel_initializer=bert.initializer
)(output)
model = keras.models.Model(bert.model.input, output)
model.summary()
def sparse_categorical_crossentropy(y_true, y_pred):
"""自定义稀疏交叉熵
这主要是因为keras自带的sparse_categorical_crossentropy不支持求二阶梯度。
"""
y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
y_true = K.cast(y_true, 'int32')
y_true = K.one_hot(y_true, K.shape(y_pred)[-1])
return K.categorical_crossentropy(y_true, y_pred)
def loss_with_gradient_penalty(y_true, y_pred, epsilon=1):
"""带梯度惩罚的loss
"""
loss = K.mean(sparse_categorical_crossentropy(y_true, y_pred))
embeddings = search_layer(y_pred, 'Embedding-Token').embeddings
gp = K.sum(K.gradients(loss, [embeddings])[0].values**2)
return loss + 0.5 * epsilon * gp
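# In formula form (my reading of the code above): total = CE + 0.5 * epsilon * ||dCE/dE||^2,
# where E is the token embedding matrix. The gradient w.r.t. an embedding matrix typically
# comes back as tf.IndexedSlices, hence the .values attribute before squaring.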
model.compile(
loss=loss_with_gradient_penalty,
optimizer=Adam(2e-5),
metrics=['sparse_categorical_accuracy'],
)
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
y_true = y_true[:, 0]
total += len(y_true)
right += (y_true == y_pred).sum()
return right / total
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, epoch, logs=None):
val_acc = evaluate(valid_generator)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
model.save_weights('best_model.weights')
print(
u'val_acc: %.5f, best_val_acc: %.5f\n' %
(val_acc, self.best_val_acc)
)
def predict_to_file(in_file, out_file):
"""输出预测结果到文件
结果文件可以提交到 https://www.cluebenchmarks.com 评测。
"""
fw = open(out_file, 'w')
with open(in_file) as fr:
for l in tqdm(fr):
l = json.loads(l)
text = l['sentence']
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
label = model.predict([[token_ids], [segment_ids]])[0].argmax()
l = json.dumps({'id': str(l['id']), 'label': str(label)})
fw.write(l + '\n')
fw.close()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=50,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.weights')
# predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json')
#! -*- coding: utf-8 -*-
# BERT for image captioning on the COCO dataset
# The image condition is injected via Conditional Layer Normalization
# Reference: https://kexue.fm/archives/7124
from __future__ import print_function
import json
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, is_string
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
import cv2
# Model configuration
maxlen = 64
batch_size = 32
steps_per_epoch = 1000
epochs = 10000
# BERT configuration
config_path = '/root/kg/bert/uncased_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/uncased_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/uncased_L-12_H-768_A-12/vocab.txt'
# Load and trim the vocabulary, build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def read_caption(f):
"""读取并整理COCO的Caption数据
"""
data = json.load(open(f))
images = {}
for img in data['images']:
images[img['id']] = {
'image_id': img['file_name'],
'caption': [],
'url': img['coco_url']
}
for caption in data['annotations']:
images[caption['image_id']]['caption'].append(caption['caption'])
return list(images.values())
def read_image(f):
"""单图读取函数(对非方形的图片进行白色填充,使其变为方形)
"""
img = cv2.imread(f)
height, width = img.shape[:2]
if height > width:
height, width = img_size, width * img_size // height
img = cv2.resize(img, (width, height))
delta = (height - width) // 2
img = cv2.copyMakeBorder(
img,
top=0,
bottom=0,
left=delta,
right=height - width - delta,
borderType=cv2.BORDER_CONSTANT,
value=[255, 255, 255]
)
else:
height, width = height * img_size // width, img_size
img = cv2.resize(img, (width, height))
delta = (width - height) // 2
img = cv2.copyMakeBorder(
img,
top=delta,
bottom=width - height - delta,
left=0,
right=0,
borderType=cv2.BORDER_CONSTANT,
value=[255, 255, 255]
)
img = img.astype('float32')
return img[..., ::-1] # cv2 reads images as BGR, but the Keras model expects RGB
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_images, batch_token_ids, batch_segment_ids = [], [], []
for is_end, D in self.sample(random):
img = '/root/caption/coco/train2014/%s' % D['image_id']
caption = np.random.choice(D['caption'])
token_ids, segment_ids = tokenizer.encode(caption, maxlen=maxlen)
batch_images.append(read_image(img))
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or is_end:
batch_images = np.array(batch_images)
batch_images = preprocess_input(batch_images)
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids, batch_images], None
batch_images, batch_token_ids, batch_segment_ids = [], [], []
# Load the data
train_data = read_caption(
'/root/caption/coco/annotations/captions_train2014.json'
)
valid_data = read_caption(
'/root/caption/coco/annotations/captions_val2014.json'
)
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉padding部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_pred = inputs
if mask[1] is None:
y_mask = 1.0
else:
y_mask = K.cast(mask[1], K.floatx())[:, 1:]
y_true = y_true[:, 1:] # target token_ids
y_pred = y_pred[:, :-1] # predicted sequence, shifted by one position
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
# Image model
MobileNetV2 = keras.applications.mobilenet_v2.MobileNetV2
preprocess_input = keras.applications.mobilenet_v2.preprocess_input
image_model = MobileNetV2(include_top=False, pooling='avg')
img_size = 299
# BERT model
model = build_transformer_model(
config_path,
checkpoint_path,
application='lm',
keep_tokens=keep_tokens, # keep only the tokens in keep_tokens, trimming the original vocabulary
layer_norm_cond=image_model.output,
layer_norm_cond_hidden_size=128,
layer_norm_cond_hidden_act='swish',
additional_input_layers=image_model.input,
)
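# How the conditioning works (as I read the arguments above): the pooled MobileNetV2
# feature vector is projected through a 128-unit hidden layer with swish activation and
# then used to produce the gain and bias of every Layer Normalization in BERT, so the
# whole language model is conditioned on the image.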
output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class AutoCaption(AutoRegressiveDecoder):
"""img2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
image = inputs[0]
token_ids = output_ids
segment_ids = np.zeros_like(token_ids)
return self.last_token(model).predict([token_ids, segment_ids, image])
def generate(self, image, topk=1):
if is_string(image):
image = read_image(image)
image = preprocess_input(image)
output_ids = self.beam_search([image], topk=topk) # beam search
return tokenizer.decode(output_ids)
autocaption = AutoCaption(
start_id=tokenizer._token_start_id,
end_id=tokenizer._token_end_id,
maxlen=maxlen
)
def just_show():
samples = [valid_data[i] for i in np.random.choice(len(valid_data), 2)]
for D in samples:
img = '/root/caption/coco/val2014/%s' % D['image_id']
print(u'image_id:', D['image_id'])
print(u'url:', D['url'])
print(u'predict:', autocaption.generate(img))
print(u'references:', D['caption'])
print()
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# save the best model
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
# show some demo predictions
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
"""
image_id: COCO_val2014_000000524611.jpg
url: http://images.cocodataset.org/val2014/COCO_val2014_000000524611.jpg
predict: a train that is sitting on the tracks.
references: [u'A train carrying chemical tanks traveling past a water tower.', u'Dual train tracks with a train on one of them and a water tower in the background.', u'a train some trees and a water tower ', u'Train on tracks with water tower for Davis Junction in the rear.', u'A train on a train track going through a bunch of trees.']
image_id: COCO_val2014_000000202923.jpg
url: http://images.cocodataset.org/val2014/COCO_val2014_000000202923.jpg
predict: a baseball game in progress with the batter up to plate.
references: [u'Batter, catcher, and umpire anticipating the next pitch.', u'A baseball player holding a baseball bat in the game.', u'A baseball player stands ready at the plate.', u'Baseball players on the field ready for the pitch.', u'A view from behind a mesh fence of a baseball game.']
"""
#! -*- coding: utf-8 -*-
# BERT as a language model: novel generation
from __future__ import print_function
import glob, re
import numpy as np
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
maxlen = 256
batch_size = 16
steps_per_epoch = 1000
epochs = 10000
# BERT configuration
config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
novels = []
for txt in glob.glob('/root/金庸/*/*.txt'):
txt = open(txt, encoding='gbk').read()
txt = txt.replace('\r', '').replace('\n', '')
txt = txt.replace(u'整理制作,并提供下载', '')
txt = re.sub(u'www.*?com', '', txt)
txt = txt.replace(u'\u3000', ' ')
sents = []
for t in txt.split(' '):
for s in re.findall(u'.*?。', t):
if len(s) <= maxlen - 2:
sents.append(s)
novels.append(sents)
# Load and trim the vocabulary, build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
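# Corpus construction (the loop below): sentences from each novel are greedily concatenated
# into segments of at most maxlen - 2 characters, so every training sample is a contiguous
# passage that still fits in the model together with [CLS] and [SEP].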
data = []
pbar = tqdm(desc=u'构建语料中', total=sum(len(n) for n in novels))
for novel in novels:
s = u''
for i in range(len(novel)):
for j in range(len(novel) - i):
if len(s) + len(novel[i + j]) > maxlen - 2:
data.append(s)
s = u''
break
else:
s += novel[i + j]
pbar.update(1)
if i + j >= len(novel):
break
if s:
data.append(s)
pbar.close()
np.random.shuffle(data)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids = [], []
for is_end, text in self.sample(random):
token_ids, segment_ids = tokenizer.encode(text)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids], None
batch_token_ids, batch_segment_ids = [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉padding部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_pred = inputs
if mask[1] is None:
y_mask = 1.0
else:
y_mask = K.cast(mask[1], K.floatx())[:, 1:]
y_true = y_true[:, 1:] # target token_ids
y_pred = y_pred[:, :-1] # predicted sequence, shifted by one position
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
model = build_transformer_model(
config_path,
checkpoint_path,
application='lm',
keep_tokens=keep_tokens, # keep only the tokens in keep_tokens, trimming the original vocabulary
)
output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class StoryCompletion(AutoRegressiveDecoder):
"""基于随机采样的故事续写
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids = inputs[0]
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.zeros_like(token_ids)
return self.last_token(model).predict([token_ids, segment_ids])
def generate(self, text, n=1, topp=0.95):
token_ids, _ = tokenizer.encode(text)
results = self.random_sample([token_ids[:-1]], n, topp=topp) # random sampling
return [text + tokenizer.decode(ids) for ids in results]
story_completion = StoryCompletion(
start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen
)
def just_show():
s1 = u'当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。'
s2 = u'虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。'
s3 = u'杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。'
for s in [s1, s2, s3]:
t = story_completion.generate(s)
print(u'输入: %s' % s)
print(u'结果: %s\n' % ('\n'.join(t)))
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# save the best model
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
# show some demo generations
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
"""
效果:
输入: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。
结果: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。次日清晨,张无忌便和赵敏去买了一匹高头大马,自己骑了随伴。那马甚有神骏,三十六斤重的身躯之中,竟无一头白马。他心中怦怦乱跳,暗想:若能将赵敏引出迷城,我决不致再和她相会,但若和赵姑娘相遇,我一生一世决计再难相见。何况我是她的私生女儿,这般亲热,岂不是好?我如何能和她相见?今后我要教训教训她才好?我教教她,教训她,要她心里快快活活的。他心如刀割,当即回到客店,将张无忌的所在说了。
输入: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。
结果: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。那矮子见他如此功力,大吃一惊,叫道:什么人?是谁?你干什么?我师父是谁?你们是谁?是谁?你们是谁?我师父是谁?你这矮子,便是段延庆。你们不知道我师父便是,是不是?快快说来。那矮子道:我师父便是延庆太子,他的徒弟也是段延庆。他老人家在唐朝做镇南王,你们便将他改名为延庆太子,叫做延庆太子!这名头倒怪,你们大伙儿听见了,也不知道他老人家是死是活。
输入: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。
结果: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。这时见他手中所握,竟是一柄特制的短剑,心中大喜,叫道::原来是金蛇郎君的剑!原来你便是金蛇郎君的弟子,这一下可要叫我失望了。那人哈哈一笑,说道:好啊!好啊,好啊!我的金蛇剑是我的,不过我是你的。这人道:我姓杨名过,名字叫过。你是我儿子,是我女儿,是不是?你这么大的年纪,怎地自称金刀驸马?我这就给你取个名字,叫作过儿。
"""
#! -*- coding: utf-8 -*-
# Train a Chinese chess (xiangqi) model by supervising a language model on game records
# Introduction: https://kexue.fm/archives/7877
# Data: https://github.com/bojone/gpt_cchess
# Training works on both Python 2 and Python 3, but the cchess module only supports
# Python 3, so interactive play against the model requires Python 3.
import json
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator
from keras.models import Model
from cchess import *
# Basic settings
maxlen = 512
steps_per_epoch = 1000
epochs = 10000
batch_size = 16
# BERT configuration
config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""读取全局棋谱
返回:[(棋谱, 结果)],其中结果等于2为红方赢棋,1为和棋,
0为黑方赢棋,-1则为无明确标注胜负。
"""
D = []
with open(filename) as f:
for l in f:
l = json.loads(l)
if not l['fen']:
result = int(l['items'].get(u'棋局结果', -1))
D.append((l['iccs'], result))
return D
# Load the data
data = load_data('/root/qipu.json')
# Build the tokenizer
chars = [u'[PAD]'] + list(u'0123456789abcdefghi')
token_dict = dict(zip(chars, range(len(chars))))
tokenizer = Tokenizer(token_dict)
tokenizer._token_unk_id = 0
bert_token_dict = load_vocab(dict_path)
keep_tokens = [bert_token_dict[c] for c in chars]
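# Vocabulary note: a move in ICCS notation is 4 characters over the digits 0-9 and the
# letters a-i, so the tokenizer above only needs these 19 symbols plus [PAD]; keep_tokens
# maps each of them to its id in the original BERT vocabulary so that the corresponding
# pre-trained embeddings can be reused.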
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids = [], []
for is_end, (text, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(
' '.join(text), maxlen=maxlen // self.n + 1
)
batch_token_ids.append([0] + token_ids[1:-1])
batch_segment_ids.append([0] + segment_ids[1:-1])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids], None
batch_token_ids, batch_segment_ids = [], []
self.count += 1
@property
def n(self):
if not hasattr(self, 'count'):
self.count = 0
if self.count < 20000:
n = 8
elif self.count < 40000:
n = 4
elif self.count < 80000:
n = 2
else:
n = 1
return n
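# Curriculum note (my interpretation of the property above): early in training
# (count < 20000 batches) each game is truncated to roughly maxlen // 8 tokens, and the
# cut-off is relaxed to maxlen // 4, maxlen // 2 and finally the full maxlen as training
# progresses.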
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉padding部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_pred = inputs
if mask[1] is None:
y_mask = 1.0
else:
y_mask = K.cast(mask[1], K.floatx())[:, 1:]
y_true = y_true[:, 1:] # target token_ids
y_pred = y_pred[:, :-1] # predicted sequence, shifted by one position
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
model = build_transformer_model(
config_path,
checkpoint_path,
application='lm',
keep_tokens=keep_tokens, # keep only the tokens in keep_tokens, trimming the original vocabulary
)
output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class ChessPlayer(object):
"""交互式下棋程序
"""
def move_to_chinese(self, move):
"""将单步走法转为中文描述
"""
if not isinstance(move, Move):
move = Move(self.board, move[0], move[1])
return move.to_chinese()
def move_to_iccs(self, move):
"""将单步走法转为iccs表示
"""
if not isinstance(move, Move):
move = Move(self.board, move[0], move[1])
return move.to_iccs()
def print_board(self):
"""打印当前棋盘
直观起见,红方用红色表示,黑方用绿色表示。
"""
for l in self.board.dump_board():
for c in u'兵炮车马相仕帅':
l = l.replace(c, u'\033[1;31;40m%s\033[0m' % c)
for c in u'卒砲砗碼象士将':
l = l.replace(c, u'\033[1;32;40m%s\033[0m' % c)
print(l)
def movable_steps(self):
"""给出当前局面所有候选走法
"""
return [self.move_to_iccs(m) for m in self.board.create_moves()]
def human_input(self):
"""人类行棋
"""
while True:
try:
iccs = input(u'请输入iccs棋着: ')
print(iccs)
move = self.board.move_iccs(iccs)
if move is not None:
return iccs, move
except KeyboardInterrupt:
return None
except:
pass
def record(self, iccs):
"""将局面往前推进一步
"""
self.history += iccs
self.board.next_turn()
self.print_board()
self.current = (self.current + 1) % 2
def new_game(self, current=0):
"""开新局
"""
self.board = ChessBoard()
self.board.from_fen(FULL_INIT_FEN)
self.print_board()
self.history = ''
self.current = current
if self.current == 0: # human moves first
iccs, move = self.human_input()
self.record(iccs)
while True:
# machine's move
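# Scoring sketch (what the next few lines do): every legal move is appended to the game
# history, the language model scores the whole sequence, and the candidate whose 4 move
# characters receive the highest summed log-probability is played.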
moves = self.movable_steps()
iccses = [' '.join(self.history + m) for m in moves]
token_ids = [[0] + tokenizer.encode(ic)[0][1:-1] for ic in iccses]
token_ids = np.array(token_ids)
segment_ids = np.zeros_like(token_ids)
preds = model.predict([token_ids, segment_ids])[:, -5:-1]
preds = np.take_along_axis(preds, token_ids[:, -4:, None], axis=2)
preds = np.log(preds + 1e-8)[:, :, 0].sum(axis=1)
iccs = moves[preds.argmax()]
move = self.board.move_iccs(iccs)
self.record(iccs)
if self.board.is_win():
print(u'机器赢了')
break
# human's move
iccs, move = self.human_input()
self.record(iccs)
if self.board.is_win():
print(u'人类赢了')
break
chessplayer = ChessPlayer()
"""
chessplayer.new_game(0) # start a new game; 0 means the human moves first, 1 the machine
"""
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def on_epoch_end(self, epoch, logs=None):
# save the model
model.save_weights('./best_model.weights')
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# Constructing reading-comprehension data with Seq2Seq
# From a passage, first sample an answer, then sample a question
# Dataset same as https://github.com/bojone/dgcnn_for_reading_comprehension
import json, os
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from bert4keras.snippets import text_segmentate
from keras.models import Model
from tqdm import tqdm
# Basic parameters
max_p_len = 128
max_q_len = 64
max_a_len = 16
batch_size = 32
epochs = 100
# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
# Annotated data
webqa_data = json.load(open('/root/qa_datasets/WebQA.json'))
sogou_data = json.load(open('/root/qa_datasets/SogouQA.json'))
# Filter the data
seps, strips = u'\n。!?!?;;,, ', u';;,, '
data = []
for d in webqa_data + sogou_data:
for p in d['passages']:
if p['answer']:
for t in text_segmentate(p['passage'], max_p_len - 2, seps, strips):
if p['answer'] in t:
data.append((t, d['question'], p['answer']))
del webqa_data
del sogou_data
# Save a random ordering (used to split off the validation set)
if not os.path.exists('../random_order.json'):
random_order = list(range(len(data)))
np.random.shuffle(random_order)
json.dump(random_order, open('../random_order.json', 'w'), indent=4)
else:
random_order = json.load(open('../random_order.json'))
# Split off the validation set
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
# Load and trim the vocabulary, build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
"""单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for is_end, (p, q, a) in self.sample(random):
p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len + 1)
a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len)
q_token_ids, _ = tokenizer.encode(q, maxlen=max_q_len)
token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:]
segment_ids = [0] * len(p_token_ids)
segment_ids += [1] * (len(token_ids) - len(p_token_ids))
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids], None
batch_token_ids, batch_segment_ids = [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉输入部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_mask, y_pred = inputs
y_true = y_true[:, 1:] # target token_ids
y_mask = y_mask[:, 1:] # segment_ids, which conveniently mark the part to be predicted
y_pred = y_pred[:, :-1] # predicted sequence, shifted by one position
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
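# UniLM note (my reading of application='unilm' together with the loss above): the
# segment_ids both drive the seq2seq attention mask (segment-0 tokens attend
# bidirectionally, segment-1 tokens attend causally) and double as the loss mask, so only
# the answer/question part contributes to the cross-entropy.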
model = build_transformer_model(
config_path,
checkpoint_path,
application='unilm',
keep_tokens=keep_tokens, # keep only the tokens in keep_tokens, trimming the original vocabulary
)
output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class QuestionAnswerGeneration(AutoRegressiveDecoder):
"""随机生成答案,并且通过beam search来生成问题
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
return self.last_token(model).predict([token_ids, segment_ids])
def generate(self, passage, topk=1, topp=0.95):
token_ids, segment_ids = tokenizer.encode(passage, maxlen=max_p_len)
a_ids = self.random_sample([token_ids, segment_ids], 1,
topp=topp)[0] # random sampling
token_ids += list(a_ids)
segment_ids += [1] * len(a_ids)
q_ids = self.beam_search([token_ids, segment_ids],
topk=topk) # beam search
return (tokenizer.decode(q_ids), tokenizer.decode(a_ids))
qag = QuestionAnswerGeneration(
start_id=None, end_id=tokenizer._token_end_id, maxlen=max_q_len
)
def predict_to_file(data, filename, topk=1):
"""将预测结果输出到文件,方便评估
"""
with open(filename, 'w', encoding='utf-8') as f:
for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
q, a = qag.generate(d[0])
s = '%s\t%s\t%s\n' % (q, a, d[0])
f.write(s)
f.flush()
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# save the best model
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=1000,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
# predict_to_file(valid_data, 'qa.csv')
#! -*- coding: utf-8 -*-
# Reading comprehension with MLM
# Dataset and evaluation same as https://github.com/bojone/dgcnn_for_reading_comprehension
# After 10 epochs it reaches about 0.77 on the validation set
# (Accuracy=0.7282149325820084 F1=0.8207266829447049 Final=0.7744708077633566)
import json, os, re
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda
from keras.models import Model
from tqdm import tqdm
max_p_len = 256
max_q_len = 64
max_a_len = 32
batch_size = 32
epochs = 10
# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
# Annotated data
webqa_data = json.load(open('/root/qa_datasets/WebQA.json'))
sogou_data = json.load(open('/root/qa_datasets/SogouQA.json'))
# Save a random ordering (used to split off the validation set)
if not os.path.exists('../random_order.json'):
random_order = list(range(len(sogou_data)))
np.random.shuffle(random_order)
json.dump(random_order, open('../random_order.json', 'w'), indent=4)
else:
random_order = json.load(open('../random_order.json'))
# Split off the validation set
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data) # mix SogouQA and WebQA at a 2:1 ratio
# Load and trim the vocabulary, build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
"""单条样本格式为
输入:[CLS][MASK][MASK][SEP]问题[SEP]篇章[SEP]
输出:答案
"""
batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []
for is_end, D in self.sample(random):
question = D['question']
answers = [p['answer'] for p in D['passages'] if p['answer']]
passage = np.random.choice(D['passages'])['passage']
passage = re.sub(u' |、|;|,', ',', passage)
final_answer = ''
for answer in answers:
if all([
a in passage[:max_p_len - 2] for a in answer.split(' ')
]):
final_answer = answer.replace(' ', ',')
break
a_token_ids, _ = tokenizer.encode(
final_answer, maxlen=max_a_len + 1
)
q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
token_ids = [tokenizer._token_start_id]
token_ids += ([tokenizer._token_mask_id] * max_a_len)
token_ids += [tokenizer._token_end_id]
token_ids += (q_token_ids[1:] + p_token_ids[1:])
segment_ids = [0] * len(token_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_a_token_ids.append(a_token_ids[1:])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_a_token_ids = sequence_padding(
batch_a_token_ids, max_a_len
)
yield [batch_token_ids, batch_segment_ids], batch_a_token_ids
batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
keep_tokens=keep_tokens, # keep only the tokens in keep_tokens, trimming the original vocabulary
)
output = Lambda(lambda x: x[:, 1:max_a_len + 1])(model.output)
model = Model(model.input, output)
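# The Lambda above slices the MLM output to positions 1..max_a_len, i.e. the block of
# [MASK] tokens reserved for the answer right after [CLS] (see data_generator), so the
# model is trained to fill exactly those masks with the answer tokens.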
model.summary()
def masked_cross_entropy(y_true, y_pred):
"""交叉熵作为loss,并mask掉padding部分的预测
"""
y_true = K.reshape(y_true, [K.shape(y_true)[0], -1])
y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
return cross_entropy
model.compile(loss=masked_cross_entropy, optimizer=Adam(1e-5))
def get_ngram_set(x, n):
"""生成ngram合集,返回结果格式是:
{(n-1)-gram: set([n-gram的第n个字集合])}
"""
result = {}
for i in range(len(x) - n + 1):
k = tuple(x[i:i + n])
if k[:-1] not in result:
result[k[:-1]] = set()
result[k[:-1]].add(k[-1])
return result
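# For example (illustration only): get_ngram_set([1, 2, 3, 2, 4], 2) returns
# {(1,): {2}, (2,): {3, 4}, (3,): {2}}, i.e. which tokens may follow each unigram.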
def gen_answer(question, passages):
"""由于是MLM模型,所以可以直接argmax解码。
"""
all_p_token_ids, token_ids, segment_ids = [], [], []
for passage in passages:
passage = re.sub(u' |、|;|,', ',', passage)
p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
all_p_token_ids.append(p_token_ids[1:])
token_ids.append([tokenizer._token_start_id])
token_ids[-1] += ([tokenizer._token_mask_id] * max_a_len)
token_ids[-1] += [tokenizer._token_end_id]
token_ids[-1] += (q_token_ids[1:] + p_token_ids[1:])
segment_ids.append([0] * len(token_ids[-1]))
token_ids = sequence_padding(token_ids)
segment_ids = sequence_padding(segment_ids)
probas = model.predict([token_ids, segment_ids])
results = {}
for t, p in zip(all_p_token_ids, probas):
a, score = tuple(), 0.
for i in range(max_a_len):
idxs = list(get_ngram_set(t, i + 1)[a])
if tokenizer._token_end_id not in idxs:
idxs.append(tokenizer._token_end_id)
# pi zeroes out the probabilities of tokens that do not appear in the passage
pi = np.zeros_like(p[i])
pi[idxs] = p[i, idxs]
a = a + (pi.argmax(),)
score += pi.max()
if a[-1] == tokenizer._token_end_id:
break
score = score / (i + 1)
a = tokenizer.decode(a)
if a:
results[a] = results.get(a, []) + [score]
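# Aggregation across passages (as I read the expression below): each candidate answer's
# scores from the different passages are combined as sum(v^2) / (sum(v) + 1), which favours
# answers that several passages support with high confidence.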
results = {
k: (np.array(v)**2).sum() / (sum(v) + 1)
for k, v in results.items()
}
return results
def max_in_dict(d):
if d:
return sorted(d.items(), key=lambda s: -s[1])[0][0]
def predict_to_file(data, filename):
"""将预测结果输出到文件,方便评估
"""
with open(filename, 'w', encoding='utf-8') as f:
for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
q_text = d['question']
p_texts = [p['passage'] for p in d['passages']]
a = gen_answer(q_text, p_texts)
a = max_in_dict(a)
if a:
s = u'%s\t%s\n' % (d['id'], a)
else:
s = u'%s\t\n' % (d['id'])
f.write(s)
f.flush()
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# save the best model
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# Reading comprehension with seq2seq
# Dataset and evaluation same as https://github.com/bojone/dgcnn_for_reading_comprehension
# After 8 epochs it reaches about 0.77 on the validation set
# (Accuracy=0.7259005836184343 F1=0.813860036706151 Final=0.7698803101622926)
import json, os, re
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
from tqdm import tqdm
max_p_len = 256
max_q_len = 64
max_a_len = 32
max_qa_len = max_q_len + max_a_len
batch_size = 32
epochs = 8
# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
# Annotated data
webqa_data = json.load(open('/root/qa_datasets/WebQA.json'))
sogou_data = json.load(open('/root/qa_datasets/SogouQA.json'))
# Save a random ordering (used to split off the validation set)
if not os.path.exists('../random_order.json'):
random_order = list(range(len(sogou_data)))
np.random.shuffle(random_order)
json.dump(random_order, open('../random_order.json', 'w'), indent=4)
else:
random_order = json.load(open('../random_order.json'))
# Split off the validation set
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data) # mix SogouQA and WebQA at a 2:1 ratio
# Load and trim the vocabulary, build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
"""单条样本格式:[CLS]篇章[SEP]问题[SEP]答案[SEP]
"""
batch_token_ids, batch_segment_ids = [], []
for is_end, D in self.sample(random):
question = D['question']
answers = [p['answer'] for p in D['passages'] if p['answer']]
passage = np.random.choice(D['passages'])['passage']
passage = re.sub(u' |、|;|,', ',', passage)
final_answer = ''
for answer in answers:
if all([
a in passage[:max_p_len - 2] for a in answer.split(' ')
]):
final_answer = answer.replace(' ', ',')
break
qa_token_ids, qa_segment_ids = tokenizer.encode(
question, final_answer, maxlen=max_qa_len + 1
)
p_token_ids, p_segment_ids = tokenizer.encode(
passage, maxlen=max_p_len
)
token_ids = p_token_ids + qa_token_ids[1:]
segment_ids = p_segment_ids + qa_segment_ids[1:]
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids], None
batch_token_ids, batch_segment_ids = [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉输入部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_mask, y_pred = inputs
y_true = y_true[:, 1:] # target token_ids
y_mask = y_mask[:, 1:] # segment_ids, which conveniently mark the part to be predicted
y_pred = y_pred[:, :-1] # predicted sequence, shifted by one position
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
model = build_transformer_model(
config_path,
checkpoint_path,
application='unilm',
keep_tokens=keep_tokens, # keep only the tokens in keep_tokens, trimming the original vocabulary
)
output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class ReadingComprehension(AutoRegressiveDecoder):
"""beam search解码来生成答案
passages为多篇章组成的list,从多篇文章中自动决策出最优的答案,
如果没答案,则返回空字符串。
mode是extractive时,按照抽取式执行,即答案必须是原篇章的一个片段。
"""
def __init__(self, mode='extractive', **kwargs):
super(ReadingComprehension, self).__init__(**kwargs)
self.mode = mode
def get_ngram_set(self, x, n):
"""生成ngram合集,返回结果格式是:
{(n-1)-gram: set([n-gram的第n个字集合])}
"""
result = {}
for i in range(len(x) - n + 1):
k = tuple(x[i:i + n])
if k[:-1] not in result:
result[k[:-1]] = set()
result[k[:-1]].add(k[-1])
return result
@AutoRegressiveDecoder.wraps(default_rtype='probas', use_states=True)
def predict(self, inputs, output_ids, states):
inputs = [i for i in inputs if i[0, 0] > -1] # filter out passages with no answer
topk = len(inputs[0])
all_token_ids, all_segment_ids = [], []
for token_ids in inputs: # each element of inputs is one passage
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.zeros_like(token_ids)
if states > 0:
segment_ids[:, -output_ids.shape[1]:] = 1
all_token_ids.extend(token_ids)
all_segment_ids.extend(segment_ids)
padded_all_token_ids = sequence_padding(all_token_ids)
padded_all_segment_ids = sequence_padding(all_segment_ids)
probas = model.predict([padded_all_token_ids, padded_all_segment_ids])
probas = [
probas[i, len(ids) - 1] for i, ids in enumerate(all_token_ids)
]
probas = np.array(probas).reshape((len(inputs), topk, -1))
if states == 0:
# This step mainly rules out passages that contain no answer:
# if the argmax at the very first step is already end_id, the passage has no answer
argmax = probas[:, 0].argmax(axis=1)
available_idxs = np.where(argmax != self.end_id)[0]
if len(available_idxs) == 0:
scores = np.zeros_like(probas[0])
scores[:, self.end_id] = 1
return scores, states + 1
else:
for i in np.where(argmax == self.end_id)[0]:
inputs[i][:, 0] = -1 # mark answerless passages by setting the first position to -1
probas = probas[available_idxs]
inputs = [i for i in inputs if i[0, 0] > -1] # filter out passages with no answer
if self.mode == 'extractive':
# In extractive mode the answer must be a span of one of the passages,
# so zero out the probabilities of all tokens that would leave the passage
new_probas = np.zeros_like(probas)
ngrams = {}
for token_ids in inputs:
token_ids = token_ids[0]
sep_idx = np.where(token_ids == tokenizer._token_end_id)[0][0]
p_token_ids = token_ids[1:sep_idx]
for k, v in self.get_ngram_set(p_token_ids, states + 1).items():
ngrams[k] = ngrams.get(k, set()) | v
for i, ids in enumerate(output_ids):
available_idxs = ngrams.get(tuple(ids), set())
available_idxs.add(tokenizer._token_end_id)
available_idxs = list(available_idxs)
new_probas[:, i, available_idxs] = probas[:, i, available_idxs]
probas = new_probas
return (probas**2).sum(0) / (probas.sum(0) + 1), states + 1 # a kind of averaged voting across passages
def answer(self, question, passages, topk=1):
token_ids = []
for passage in passages:
passage = re.sub(u' |、|;|,', ',', passage)
p_token_ids = tokenizer.encode(passage, maxlen=max_p_len)[0]
q_token_ids = tokenizer.encode(question, maxlen=max_q_len + 1)[0]
token_ids.append(p_token_ids + q_token_ids[1:])
output_ids = self.beam_search(
token_ids, topk=topk, states=0
) # beam search
return tokenizer.decode(output_ids)
reader = ReadingComprehension(
start_id=None,
end_id=tokenizer._token_end_id,
maxlen=max_a_len,
mode='extractive'
)
def predict_to_file(data, filename, topk=1):
"""将预测结果输出到文件,方便评估
"""
with open(filename, 'w', encoding='utf-8') as f:
for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
q_text = d['question']
p_texts = [p['passage'] for p in d['passages']]
a = reader.answer(q_text, p_texts, topk)
if a:
s = u'%s\t%s\n' % (d['id'], a)
else:
s = u'%s\t\n' % (d['id'])
f.write(s)
f.flush()
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# save the best model
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding:utf-8 -*-
# Triple (subject, predicate, object) extraction based on the "half-pointer, half-tagging" structure
# Article: https://kexue.fm/archives/7161
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
# Best f1 = 0.82198
# Switching to RoBERTa Large reaches f1 = 0.829+
# Note: because EMA is used, it only takes effect after enough steps (5000+); if your
# dataset is small, make sure to run enough epochs, or drop the EMA.
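# Labelling scheme, as reflected in the code below: subjects are tagged with start/end
# pointer vectors of shape (seq_len, 2); for one randomly sampled subject, objects are
# tagged per predicate with start/end pointers of shape (seq_len, num_predicates, 2), and
# the subject representation conditions the object prediction via Conditional Layer
# Normalization.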
import json
import numpy as np
from bert4keras.backend import keras, K, batch_gather
from bert4keras.layers import Loss
from bert4keras.layers import LayerNormalization
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_exponential_moving_average
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, to_array
from keras.layers import Input, Dense, Lambda, Reshape
from keras.models import Model
from tqdm import tqdm
maxlen = 128
batch_size = 64
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:{'text': text, 'spo_list': [(s, p, o)]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
D.append({
'text': l['text'],
'spo_list': [(spo['subject'], spo['predicate'], spo['object'])
for spo in l['spo_list']]
})
return D
# Load the datasets
train_data = load_data('/root/kg/datasets/train_data.json')
valid_data = load_data('/root/kg/datasets/dev_data.json')
predicate2id, id2predicate = {}, {}
with open('/root/kg/datasets/all_50_schemas') as f:
for l in f:
l = json.loads(l)
if l['predicate'] not in predicate2id:
id2predicate[len(predicate2id)] = l['predicate']
predicate2id[l['predicate']] = len(predicate2id)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def search(pattern, sequence):
"""从sequence中寻找子串pattern
如果找到,返回第一个下标;否则返回-1。
"""
n = len(pattern)
for i in range(len(sequence)):
if sequence[i:i + n] == pattern:
return i
return -1
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids = [], []
batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
for is_end, d in self.sample(random):
token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
# organize the triples as {s: [(o, p)]}
spoes = {}
for s, p, o in d['spo_list']:
s = tokenizer.encode(s)[0][1:-1]
p = predicate2id[p]
o = tokenizer.encode(o)[0][1:-1]
s_idx = search(s, token_ids)
o_idx = search(o, token_ids)
if s_idx != -1 and o_idx != -1:
s = (s_idx, s_idx + len(s) - 1)
o = (o_idx, o_idx + len(o) - 1, p)
if s not in spoes:
spoes[s] = []
spoes[s].append(o)
if spoes:
# subject labels
subject_labels = np.zeros((len(token_ids), 2))
for s in spoes:
subject_labels[s[0], 0] = 1
subject_labels[s[1], 1] = 1
# randomly pick one subject (this is not a bug; it is exactly the intended behaviour!)
start, end = np.array(list(spoes.keys())).T
start = np.random.choice(start)
end = np.random.choice(end[end >= start])
subject_ids = (start, end)
# object labels for the chosen subject
object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
for o in spoes.get(subject_ids, []):
object_labels[o[0], o[2], 0] = 1
object_labels[o[1], o[2], 1] = 1
# build the batch
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_subject_labels.append(subject_labels)
batch_subject_ids.append(subject_ids)
batch_object_labels.append(object_labels)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_subject_labels = sequence_padding(
batch_subject_labels
)
batch_subject_ids = np.array(batch_subject_ids)
batch_object_labels = sequence_padding(batch_object_labels)
yield [
batch_token_ids, batch_segment_ids,
batch_subject_labels, batch_subject_ids,
batch_object_labels
], None
batch_token_ids, batch_segment_ids = [], []
batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
def extract_subject(inputs):
"""根据subject_ids从output中取出subject的向量表征
"""
output, subject_ids = inputs
start = batch_gather(output, subject_ids[:, :1])
end = batch_gather(output, subject_ids[:, 1:])
subject = K.concatenate([start, end], 2)
return subject[:, 0]
# Additional inputs
subject_labels = Input(shape=(None, 2), name='Subject-Labels')
subject_ids = Input(shape=(2,), name='Subject-Ids')
object_labels = Input(shape=(None, len(predicate2id), 2), name='Object-Labels')
# Load the pre-trained model
bert = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
return_keras_model=False,
)
# Predict subjects
output = Dense(
units=2, activation='sigmoid', kernel_initializer=bert.initializer
)(bert.model.output)
subject_preds = Lambda(lambda x: x**2)(output)
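# Presumably the squaring here (and the 4th power on the object scores below) sharpens the
# sigmoid outputs towards 0, reflecting the prior that most positions are not entity
# boundaries; see the accompanying article for the author's reasoning.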
subject_model = Model(bert.model.inputs, subject_preds)
# Given the subject, predict the objects
# The subject is fused into the object prediction via Conditional Layer Normalization
output = bert.model.layers[-2].get_output_at(-1) # think about why this is -2 rather than -1
subject = Lambda(extract_subject)([output, subject_ids])
output = LayerNormalization(conditional=True)([output, subject])
output = Dense(
units=len(predicate2id) * 2,
activation='sigmoid',
kernel_initializer=bert.initializer
)(output)
output = Lambda(lambda x: x**4)(output)
object_preds = Reshape((-1, len(predicate2id), 2))(output)
object_model = Model(bert.model.inputs + [subject_ids], object_preds)
class TotalLoss(Loss):
"""subject_loss与object_loss之和,都是二分类交叉熵
"""
def compute_loss(self, inputs, mask=None):
subject_labels, object_labels = inputs[:2]
subject_preds, object_preds, _ = inputs[2:]
if mask[4] is None:
mask = 1.0
else:
mask = K.cast(mask[4], K.floatx())
# subject loss
subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
subject_loss = K.mean(subject_loss, 2)
subject_loss = K.sum(subject_loss * mask) / K.sum(mask)
# object loss
object_loss = K.binary_crossentropy(object_labels, object_preds)
object_loss = K.sum(K.mean(object_loss, 3), 2)
object_loss = K.sum(object_loss * mask) / K.sum(mask)
# total loss
return subject_loss + object_loss
subject_preds, object_preds = TotalLoss([2, 3])([
subject_labels, object_labels, subject_preds, object_preds,
bert.model.output
])
# Training model
train_model = Model(
bert.model.inputs + [subject_labels, subject_ids, object_labels],
[subject_preds, object_preds]
)
AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
optimizer = AdamEMA(learning_rate=1e-5)
train_model.compile(optimizer=optimizer)
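# EMA usage pattern (as used by the Evaluator further down): the AdamEMA optimizer keeps an
# exponential moving average of the weights; apply_ema_weights() swaps the averaged weights
# in for evaluation and saving, and reset_old_weights() restores the raw training weights.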
def extract_spoes(text):
"""抽取输入text所包含的三元组
"""
tokens = tokenizer.tokenize(text, maxlen=maxlen)
mapping = tokenizer.rematch(text, tokens)
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids, segment_ids = to_array([token_ids], [segment_ids])
# extract subjects
subject_preds = subject_model.predict([token_ids, segment_ids])
start = np.where(subject_preds[0, :, 0] > 0.6)[0]
end = np.where(subject_preds[0, :, 1] > 0.5)[0]
subjects = []
for i in start:
j = end[end >= i]
if len(j) > 0:
j = j[0]
subjects.append((i, j))
if subjects:
spoes = []
token_ids = np.repeat(token_ids, len(subjects), 0)
segment_ids = np.repeat(segment_ids, len(subjects), 0)
subjects = np.array(subjects)
# given the subjects, extract objects and predicates
object_preds = object_model.predict([token_ids, segment_ids, subjects])
for subject, object_pred in zip(subjects, object_preds):
start = np.where(object_pred[:, :, 0] > 0.6)
end = np.where(object_pred[:, :, 1] > 0.5)
for _start, predicate1 in zip(*start):
for _end, predicate2 in zip(*end):
if _start <= _end and predicate1 == predicate2:
spoes.append(
((mapping[subject[0]][0],
mapping[subject[1]][-1]), predicate1,
(mapping[_start][0], mapping[_end][-1]))
)
break
return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1])
for s, p, o, in spoes]
else:
return []
class SPO(tuple):
"""用来存三元组的类
表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
使得在判断两个三元组是否等价时容错性更好。
"""
def __init__(self, spo):
self.spox = (
tuple(tokenizer.tokenize(spo[0])),
spo[1],
tuple(tokenizer.tokenize(spo[2])),
)
def __hash__(self):
return self.spox.__hash__()
def __eq__(self, spo):
return self.spox == spo.spox
def evaluate(data):
"""评估函数,计算f1、precision、recall
"""
X, Y, Z = 1e-10, 1e-10, 1e-10
f = open('dev_pred.json', 'w', encoding='utf-8')
pbar = tqdm()
for d in data:
R = set([SPO(spo) for spo in extract_spoes(d['text'])])
T = set([SPO(spo) for spo in d['spo_list']])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
pbar.update()
pbar.set_description(
'f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall)
)
s = json.dumps({
'text': d['text'],
'spo_list': list(T),
'spo_list_pred': list(R),
'new': list(R - T),
'lack': list(T - R),
},
ensure_ascii=False,
indent=4)
f.write(s + '\n')
pbar.close()
f.close()
return f1, precision, recall
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0.
def on_epoch_end(self, epoch, logs=None):
optimizer.apply_ema_weights()
f1, precision, recall = evaluate(valid_data)
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
train_model.save_weights('best_model.weights')
optimizer.reset_old_weights()
print(
'f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
(f1, precision, recall, self.best_val_f1)
)
if __name__ == '__main__':
train_generator = data_generator(train_data, batch_size)
evaluator = Evaluator()
train_model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=20,
callbacks=[evaluator]
)
else:
train_model.load_weights('best_model.weights')
#! -*- coding:utf-8 -*-
# Sentence-pair classification on the LCQMC dataset
# val_acc: 0.887071, test_acc: 0.870320
import numpy as np
from bert4keras.backend import keras, set_gelu, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Dropout, Dense
set_gelu('tanh') # 切换gelu版本
maxlen = 128
batch_size = 64
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text1, text2, label = l.strip().split('\t')
D.append((text1, text2, int(label)))
return D
# 加载数据集
train_data = load_data('datasets/lcqmc/lcqmc.train.data')
valid_data = load_data('datasets/lcqmc/lcqmc.valid.data')
test_data = load_data('datasets/lcqmc/lcqmc.test.data')
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, (text1, text2, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(
text1, text2, maxlen=maxlen
)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# 加载预训练模型
bert = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
with_pool=True,
return_keras_model=False,
)
output = Dropout(rate=0.1)(bert.model.output)
output = Dense(
units=2, activation='softmax', kernel_initializer=bert.initializer
)(output)
model = keras.models.Model(bert.model.input, output)
model.summary()
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=Adam(2e-5), # 用足够小的学习率
# optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}),
metrics=['accuracy'],
)
# 转换数据集
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
y_true = y_true[:, 0]
total += len(y_true)
right += (y_true == y_pred).sum()
return right / total
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, epoch, logs=None):
val_acc = evaluate(valid_generator)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
model.save_weights('best_model.weights')
test_acc = evaluate(test_generator)
print(
u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
(val_acc, self.best_val_acc, test_acc)
)
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=20,
callbacks=[evaluator]
)
model.load_weights('best_model.weights')
print(u'final test acc: %05f\n' % (evaluate(test_generator)))
else:
model.load_weights('best_model.weights')
#! -*- coding:utf-8 -*-
# Sentiment analysis example, loading albert_zh weights (https://github.com/brightmart/albert_zh)
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
set_gelu('tanh') # 切换gelu版本
num_classes = 2
maxlen = 128
batch_size = 32
config_path = '/root/kg/bert/albert_small_zh_google/albert_config.json'
checkpoint_path = '/root/kg/bert/albert_small_zh_google/albert_model.ckpt'
dict_path = '/root/kg/bert/albert_small_zh_google/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:(文本, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
D.append((text, int(label)))
return D
# 加载数据集
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, (text, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# 加载预训练模型
bert = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
model='albert',
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(
units=num_classes,
activation='softmax',
kernel_initializer=bert.initializer
)(output)
model = keras.models.Model(bert.model.input, output)
model.summary()
# Derive an optimizer with a piecewise-linear learning-rate schedule.
# The name argument is optional, but it is best to set it so that different derived optimizers can be told apart.
AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
model.compile(
loss='sparse_categorical_crossentropy',
# optimizer=Adam(1e-5), # 用足够小的学习率
optimizer=AdamLR(learning_rate=1e-4, lr_schedule={
1000: 1,
2000: 0.1
}),
metrics=['accuracy'],
)
# 转换数据集
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
y_true = y_true[:, 0]
total += len(y_true)
right += (y_true == y_pred).sum()
return right / total
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, epoch, logs=None):
val_acc = evaluate(valid_generator)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
model.save_weights('best_model.weights')
test_acc = evaluate(test_generator)
print(
u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
(val_acc, self.best_val_acc, test_acc)
)
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=10,
callbacks=[evaluator]
)
model.load_weights('best_model.weights')
print(u'final test acc: %05f\n' % (evaluate(test_generator)))
else:
model.load_weights('best_model.weights')
#! -*- coding: utf-8 -*-
# Rank input tokens by importance via Integrated Gradients
# Builds on task_sentiment_albert.py
# Original paper: https://arxiv.org/abs/1703.01365
# Blog post: https://kexue.fm/archives/7533
# Please make sure you understand the principle before reading the code; the code below is only an interactive demo, not a finished API
from task_sentiment_albert import *
from keras.layers import Layer, Input
from bert4keras.backend import K, batch_gather
from keras.models import Model
from bert4keras.snippets import uniout
class Gradient(Layer):
"""获取梯度的层
"""
def __init__(self, **kwargs):
super(Gradient, self).__init__(**kwargs)
self.supports_masking = True
def call(self, input):
input, output, label = input
output = batch_gather(output, label)
return K.gradients(output, [input])[0] * input
def compute_output_shape(self, input_shape):
return input_shape[0]
label_in = Input(shape=(1,)) # 指定标签
input = model.get_layer('Embedding-Token').output
output = model.output
grads = Gradient()([input, output, label_in])
grad_model = Model(model.inputs + [label_in], grads)
# 获取原始embedding层
embeddings = model.get_layer('Embedding-Token').embeddings
values = K.eval(embeddings)
text = u'这家店真黑心'
text = u'图太乱了 有点看不懂重点 讲故事的时候很难让孩子集中'
text = u'这是一本很好看的书'
text = u'这是一本很糟糕的书'
token_ids, segment_ids = tokenizer.encode(text)
preds = model.predict([[token_ids], [segment_ids]])
label = np.argmax(preds[0])
pred_grads = []
n = 20
for i in range(n):
# For NLP tasks the reference baseline is usually just the zero vector, so here
# the embedding layer is scaled gradually from zero to its original values to realize the path integral.
alpha = 1.0 * i / (n - 1)
K.set_value(embeddings, alpha * values)
pred_grad = grad_model.predict([[token_ids], [segment_ids], [[label]]])[0]
pred_grads.append(pred_grad)
# then average over the path steps
pred_grads = np.mean(pred_grads, 0)
# At this point we have a matrix of shape (seq_len, hidden_dim) that we need to reduce to (seq_len,).
# Two options: 1) take the L2 norm directly; 2) take the absolute value and then the max. They work about equally well.
scores = np.sqrt((pred_grads**2).sum(axis=1))
scores = (scores - scores.min()) / (scores.max() - scores.min())
scores = scores.round(4)
results = [(tokenizer.decode([t]), s) for t, s in zip(token_ids, scores)]
print(results[1:-1])
scores = np.abs(pred_grads).max(axis=1)
scores = (scores - scores.min()) / (scores.max() - scores.min())
scores = scores.round(4)
results = [(tokenizer.decode([t]), s) for t, s in zip(token_ids, scores)]
print(results[1:-1])
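# A self-contained numpy sketch (hypothetical, for intuition only) of the integrated-gradients
# estimate realized by the loop above: average the gradient along the straight path from the
# zero baseline to the input, then weight by the input itself. `grad_fn` is an assumed stand-in
# for the gradient of the selected output w.r.t. the embeddings.
def integrated_gradients_sketch(x, grad_fn, n=20):
    alphas = np.linspace(0.0, 1.0, n)
    avg_grad = np.mean([grad_fn(a * x) for a in alphas], axis=0)
    return x * avg_grad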
#! -*- coding:utf-8 -*-
# Semi-supervised learning via virtual adversarial training
# use_vat=True gives roughly a 1% improvement over use_vat=False
# Dataset: the sentiment analysis dataset
# Blog post: https://kexue.fm/archives/7466
# Works with Keras 2.3.1
import json
import numpy as np
from bert4keras.backend import keras, search_layer, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
from keras.utils import to_categorical
from tqdm import tqdm
# 配置信息
num_classes = 2
maxlen = 128
batch_size = 32
train_frac = 0.01 # fraction of the data treated as labeled
use_vat = True # compare the effect of True vs False
# BERT base
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:(文本, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
D.append((text, int(label)))
return D
# 加载数据集
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')
# simulate labeled and unlabeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 0) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, (text, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append(label)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = to_categorical(batch_labels, num_classes)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# 转换数据集
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)
# 加载预训练模型
bert = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0])(bert.model.output)
output = Dense(
units=num_classes,
activation='softmax',
kernel_initializer=bert.initializer
)(output)
# model used for normal (supervised) training
model = keras.models.Model(bert.model.input, output)
model.summary()
model.compile(
loss='kld',
optimizer=Adam(2e-5),
metrics=['categorical_accuracy'],
)
# model used for virtual adversarial training
model_vat = keras.models.Model(bert.model.input, output)
model_vat.compile(
loss='kld',
optimizer=Adam(1e-5),
metrics=['categorical_accuracy'],
)
def virtual_adversarial_training(
model, embedding_name, epsilon=1, xi=10, iters=1
):
"""给模型添加虚拟对抗训练
其中model是需要添加对抗训练的keras模型,embedding_name
则是model里边Embedding层的名字。要在模型compile之后使用。
"""
if model.train_function is None: # 如果还没有训练函数
model._make_train_function() # 手动make
old_train_function = model.train_function # 备份旧的训练函数
# 查找Embedding层
for output in model.outputs:
embedding_layer = search_layer(output, embedding_name)
if embedding_layer is not None:
break
if embedding_layer is None:
raise Exception('Embedding layer not found')
# 求Embedding梯度
embeddings = embedding_layer.embeddings # Embedding矩阵
gradients = K.gradients(model.total_loss, [embeddings]) # Embedding梯度
gradients = K.zeros_like(embeddings) + gradients[0] # 转为dense tensor
# 封装为函数
inputs = (
model._feed_inputs + model._feed_targets + model._feed_sample_weights
) # 所有输入层
model_outputs = K.function(
inputs=inputs,
outputs=model.outputs,
name='model_outputs',
) # 模型输出函数
embedding_gradients = K.function(
inputs=inputs,
outputs=[gradients],
name='embedding_gradients',
) # 模型梯度函数
def l2_normalize(x):
return x / (np.sqrt((x**2).sum()) + 1e-8)
def train_function(inputs): # 重新定义训练函数
outputs = model_outputs(inputs)
inputs = inputs[:2] + outputs + inputs[3:]
delta1, delta2 = 0.0, np.random.randn(*K.int_shape(embeddings))
for _ in range(iters): # 迭代求扰动
delta2 = xi * l2_normalize(delta2)
K.set_value(embeddings, K.eval(embeddings) - delta1 + delta2)
delta1 = delta2
delta2 = embedding_gradients(inputs)[0] # Embedding梯度
delta2 = epsilon * l2_normalize(delta2)
K.set_value(embeddings, K.eval(embeddings) - delta1 + delta2)
outputs = old_train_function(inputs) # 梯度下降
K.set_value(embeddings, K.eval(embeddings) - delta2) # 删除扰动
return outputs
model.train_function = train_function # 覆盖原训练函数
# Once the function is written, enabling adversarial training takes just one line of code
virtual_adversarial_training(model_vat, 'Embedding-Token')
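# A minimal numpy sketch (hypothetical, for intuition only) of the perturbation rule inside
# train_function above: start from random noise, take `iters` power-iteration steps along the
# gradient evaluated at the perturbed embeddings, then scale the final direction to radius
# epsilon. `grad_at` is an assumed callable returning the embedding gradient for a given
# perturbation of the embedding matrix.
def vat_perturbation_sketch(grad_at, shape, epsilon=1, xi=10, iters=1):
    def l2_normalize(x):
        return x / (np.sqrt((x**2).sum()) + 1e-8)
    delta = np.random.randn(*shape)
    for _ in range(iters):
        delta = grad_at(xi * l2_normalize(delta))
    return epsilon * l2_normalize(delta)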
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
y_true = y_true.argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum()
return right / total
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
self.data = data_generator(unlabeled_data, batch_size).forfit()
def on_epoch_end(self, epoch, logs=None):
val_acc = evaluate(valid_generator)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
model.save_weights('best_model.weights')
test_acc = evaluate(test_generator)
print(
u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
(val_acc, self.best_val_acc, test_acc)
)
def on_batch_end(self, batch, logs=None):
if use_vat:
dx, dy = next(self.data)
model_vat.train_on_batch(dx, dy)
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_generator.forfit(),
steps_per_epoch=30,
epochs=100,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.weights')
#! -*- coding: utf-8 -*-
# Solving primary-school math word problems with Seq2Seq
# Dataset: ape210k, https://github.com/Chenny0808/ape210k
# Accuracy is 70%+ for the Base model and 73%+ for the Large model
# Tested with: tensorflow 1.14 + keras 2.3.1 + bert4keras 0.8.8
# Introduction: https://kexue.fm/archives/7809
from __future__ import division
import json, re
import numpy as np
import pandas as pd
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
from sympy import Integer
# 基本参数
maxlen = 192
batch_size = 32
epochs = 100
# bert配置
config_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/bert_config.json'
checkpoint_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/bert_model.ckpt'
dict_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/vocab.txt'
def is_equal(a, b):
"""比较两个结果是否相等
"""
a = round(float(a), 6)
b = round(float(b), 6)
return a == b
def remove_bucket(equation):
"""去掉冗余的括号
"""
l_buckets, buckets = [], []
for i, c in enumerate(equation):
if c == '(':
l_buckets.append(i)
elif c == ')':
buckets.append((l_buckets.pop(), i))
eval_equation = eval(equation)
for l, r in buckets:
new_equation = '%s %s %s' % (
equation[:l], equation[l + 1:r], equation[r + 1:]
)
try:
if is_equal(eval(new_equation.replace(' ', '')), eval_equation):
equation = new_equation
except:
pass
return equation.replace(' ', '')
def load_data(filename):
"""读取训练数据,并做一些标准化,保证equation是可以eval的
参考:https://kexue.fm/archives/7809
"""
D = []
for l in open(filename):
l = json.loads(l)
question, equation, answer = l['original_text'], l['equation'], l['ans']
# 处理带分数
question = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', question)
equation = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', equation)
answer = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', answer)
equation = re.sub('(\d+)\(', '\\1+(', equation)
answer = re.sub('(\d+)\(', '\\1+(', answer)
# 分数去括号
question = re.sub('\((\d+/\d+)\)', '\\1', question)
# 处理百分数
equation = re.sub('([\.\d]+)%', '(\\1/100)', equation)
answer = re.sub('([\.\d]+)%', '(\\1/100)', answer)
# 冒号转除号、剩余百分号处理
equation = equation.replace(':', '/').replace('%', '/100')
answer = answer.replace(':', '/').replace('%', '/100')
if equation[:2] == 'x=':
equation = equation[2:]
try:
if is_equal(eval(equation), eval(answer)):
D.append((question, remove_bucket(equation), answer))
except:
continue
return D
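# Informal examples of the normalization above (for illustration only):
#   mixed number:  '6(3/4)'    -> '(6+3/4)'   so that eval() reads it as 6 + 3/4
#   percentage:    '75%'       -> '(75/100)'
#   equation:      'x=(1+2)*3' -> '(1+2)*3'   (the leading 'x=' is dropped)
# remove_bucket then strips brackets that do not change the value, e.g. '((1+2))*3' -> '(1+2)*3'.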
# 加载数据集
train_data = load_data('/root/ape210k/train.ape.json')
valid_data = load_data('/root/ape210k/valid.ape.json')
test_data = load_data('/root/ape210k/test.ape.json')
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids = [], []
for is_end, (question, equation, answer) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(
question, equation, maxlen=maxlen
)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids], None
batch_token_ids, batch_segment_ids = [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉输入部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_mask, y_pred = inputs
y_true = y_true[:, 1:] # 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1] # 预测序列,错开一位
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
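# Informal illustration of the masking above: for a UNILM input such as
#   token_ids   = [CLS, q1, q2, SEP, a1, a2, SEP]
#   segment_ids = [  0,  0,  0,   0,  1,  1,   1]
# only positions whose segment id is 1 (the answer part) contribute to the loss, and the
# prediction at position i is compared against the token at position i + 1.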
model = build_transformer_model(
config_path,
checkpoint_path,
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
)
output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(2e-5))
model.summary()
class AutoSolve(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
return self.last_token(model).predict([token_ids, segment_ids])
def generate(self, text, topk=1):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.beam_search([token_ids, segment_ids],
topk=topk) # 基于beam search
return tokenizer.decode(output_ids).replace(' ', '')
autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64)
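# Hypothetical usage sketch (output not guaranteed): after training, a call such as
#   autosolve.generate(u'一本书共120页,小明每天看15页,几天可以看完?')
# is expected to return an eval()-able equation string, e.g. '120/15'.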
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_acc = 0.
def on_epoch_end(self, epoch, logs=None):
metrics = self.evaluate(valid_data) # 评测模型
if metrics['acc'] >= self.best_acc:
self.best_acc = metrics['acc']
model.save_weights('./best_model.weights') # 保存模型
metrics['best_acc'] = self.best_acc
print('valid_data:', metrics)
def evaluate(self, data, topk=1):
total, right = 0.0, 0.0
for question, equation, answer in tqdm(data):
total += 1
pred_equation = autosolve.generate(question, topk)
try:
right += int(is_equal(eval(pred_equation), eval(answer)))
except:
pass
return {'acc': right / total}
def predict(in_file, out_file, topk=1):
"""输出预测结果到文件
该函数主要为比赛 https://www.datafountain.cn/competitions/467 所写,
主要是读取该比赛的测试集,然后预测equation,并且根据不同的问题输出不同格式的答案,
out_file可以直接提交到线上评测,线上准确率可以达到38%+。
"""
fw = open(out_file, 'w', encoding='utf-8')
raw_data = pd.read_csv(in_file, header=None, encoding='utf-8')
for i, question in tqdm(raw_data.values):
question = re.sub('(\d+)_(\d+/\d+)', '(\\1+\\2)', question)
pred_equation = autosolve.generate(question, topk)
if '.' not in pred_equation:
pred_equation = re.sub('([\d]+)', 'Integer(\\1)', pred_equation)
try:
pred_answer = eval(pred_equation)
except:
pred_answer = np.random.choice(21) + 1
if '.' in pred_equation:
if u'百分之几' in question:
pred_answer = pred_answer * 100
pred_answer = round(pred_answer, 2)
if int(pred_answer) == pred_answer:
pred_answer = int(pred_answer)
if (
re.findall(u'多少[辆|人|个|只|箱|包本|束|头|盒|张]', question) or
re.findall(u'几[辆|人|个|只|箱|包|本|束|头|盒|张]', question)
):
if re.findall(u'至少|最少', question):
pred_answer = np.ceil(pred_answer)
elif re.findall(u'至多|最多', question):
pred_answer = np.floor(pred_answer)
else:
pred_answer = np.ceil(pred_answer)
pred_answer = int(pred_answer)
pred_answer = str(pred_answer)
if u'百分之几' in question:
pred_answer = pred_answer + '%'
else:
pred_answer = str(pred_answer)
if '/' in pred_answer:
if re.findall('\d+/\d+', question):
a, b = pred_answer.split('/')
a, b = int(a), int(b)
if a > b:
pred_answer = '%s_%s/%s' % (a // b, a % b, b)
else:
if re.findall(u'至少|最少', question):
pred_answer = np.ceil(eval(pred_answer))
elif re.findall(u'至多|最多', question):
pred_answer = np.floor(eval(pred_answer))
else:
pred_answer = np.ceil(eval(pred_answer))
pred_answer = str(int(pred_answer))
fw.write(str(i) + ',' + pred_answer + '\n')
fw.flush()
fw.close()
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# BERT for Seq2Seq tasks, using the UNILM scheme
# Introduction: https://kexue.fm/archives/6933
from __future__ import print_function
import glob
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
# 基本参数
maxlen = 256
batch_size = 16
steps_per_epoch = 1000
epochs = 10000
# bert配置
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'
# Training samples: the THUCNews dataset, with each sample saved as a single txt file.
txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids = [], []
for is_end, txt in self.sample(random):
text = open(txt, encoding='utf-8').read()
text = text.split('\n')
if len(text) > 1:
title = text[0]
content = '\n'.join(text[1:])
token_ids, segment_ids = tokenizer.encode(
content, title, maxlen=maxlen
)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids], None
batch_token_ids, batch_segment_ids = [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉输入部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_mask, y_pred = inputs
y_true = y_true[:, 1:] # 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1] # 预测序列,错开一位
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
model = build_transformer_model(
config_path,
checkpoint_path,
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
)
output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
return self.last_token(model).predict([token_ids, segment_ids])
def generate(self, text, topk=1):
max_c_len = maxlen - self.maxlen
token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
output_ids = self.beam_search([token_ids, segment_ids],
topk=topk) # 基于beam search
return tokenizer.decode(output_ids)
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)
def just_show():
s1 = u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。'
s2 = u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
print()
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
# 演示效果
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(txts, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# BERT for Seq2Seq tasks, using the UNILM scheme
# Introduction: https://kexue.fm/archives/6933
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# Adds the evaluation metrics bleu, rouge-1, rouge-2 and rouge-l
from __future__ import print_function
import numpy as np
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# 基本参数
maxlen = 256
batch_size = 16
epochs = 20
# bert配置
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
title, content = l.strip().split('\t')
D.append((title, content))
return D
# 加载数据集
train_data = load_data('/root/csl/train.tsv')
valid_data = load_data('/root/csl/val.tsv')
test_data = load_data('/root/csl/test.tsv')
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids = [], []
for is_end, (title, content) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(
content, title, maxlen=maxlen
)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
yield [batch_token_ids, batch_segment_ids], None
batch_token_ids, batch_segment_ids = [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉输入部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_mask, y_pred = inputs
y_true = y_true[:, 1:] # 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1] # 预测序列,错开一位
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
model = build_transformer_model(
config_path,
checkpoint_path,
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
)
output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
return self.last_token(model).predict([token_ids, segment_ids])
def generate(self, text, topk=1):
max_c_len = maxlen - self.maxlen
token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
output_ids = self.beam_search([token_ids, segment_ids],
topk=topk) # 基于beam search
return tokenizer.decode(output_ids)
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, epoch, logs=None):
metrics = self.evaluate(valid_data) # 评测模型
if metrics['bleu'] > self.best_bleu:
self.best_bleu = metrics['bleu']
model.save_weights('./best_model.weights') # 保存模型
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content, topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[title.split(' ')],
hypothesis=pred_title.split(' '),
smoothing_function=self.smooth
)
rouge_1 /= total
rouge_2 /= total
rouge_l /= total
bleu /= total
return {
'rouge-1': rouge_1,
'rouge-2': rouge_2,
'rouge-l': rouge_l,
'bleu': bleu,
}
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# Fine-tuning the multilingual T5 for Seq2Seq tasks
# Introduction: https://kexue.fm/archives/7867
# Details: https://github.com/bojone/t5_in_bert4keras
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# Adds the evaluation metrics bleu, rouge-1, rouge-2 and rouge-l
from __future__ import print_function
import json
import numpy as np
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import SpTokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
from rouge import Rouge # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# 基本参数
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 40
# 模型路径
config_path = '/root/kg/bert/mt5/mt5_base/mt5_base_config.json'
checkpoint_path = '/root/kg/bert/mt5/mt5_base/model.ckpt-1000000'
spm_path = '/root/kg/bert/mt5/sentencepiece_cn.model'
keep_tokens_path = '/root/kg/bert/mt5/sentencepiece_cn_keep_tokens.json'
def load_data(filename):
"""加载数据
单条格式:(标题, 正文)
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
title, content = l.strip().split('\t')
D.append((title, content))
return D
# 加载数据集
train_data = load_data('/root/csl/train.tsv')
valid_data = load_data('/root/csl/val.tsv')
test_data = load_data('/root/csl/test.tsv')
# 加载分词器
tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>')
keep_tokens = json.load(open(keep_tokens_path))
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_c_token_ids, batch_t_token_ids = [], []
for is_end, (title, content) in self.sample(random):
c_token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
t_token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
batch_c_token_ids.append(c_token_ids)
batch_t_token_ids.append([0] + t_token_ids)
if len(batch_c_token_ids) == self.batch_size or is_end:
batch_c_token_ids = sequence_padding(batch_c_token_ids)
batch_t_token_ids = sequence_padding(batch_t_token_ids)
yield [batch_c_token_ids, batch_t_token_ids], None
batch_c_token_ids, batch_t_token_ids = [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉输入部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_pred = inputs
y_true = y_true[:, 1:] # 目标token_ids
y_mask = K.cast(mask[1], K.floatx())[:, 1:] # 解码器自带mask
y_pred = y_pred[:, :-1] # 预测序列,错开一位
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
t5 = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
keep_tokens=keep_tokens,
model='t5.1.1',
return_keras_model=False,
name='T5',
)
encoder = t5.encoder
decoder = t5.decoder
model = t5.model
model.summary()
output = CrossEntropy(1)([model.inputs[1], model.outputs[0]])
model = Model(model.inputs, output)
model.compile(optimizer=Adam(2e-4))
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
c_encoded = inputs[0]
return self.last_token(decoder).predict([c_encoded, output_ids])
def generate(self, text, topk=1):
c_token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
c_encoded = encoder.predict(np.array([c_token_ids]))[0]
output_ids = self.beam_search([c_encoded], topk=topk) # 基于beam search
return tokenizer.decode([int(i) for i in output_ids])
# Note: T5 has a rather puzzling convention: its <bos> token id is 0, i.e. <bos> and <pad> actually share id 0
autotitle = AutoTitle(
start_id=0, end_id=tokenizer._token_end_id, maxlen=max_t_len
)
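# Note (added for clarity): start_id=0 here matches the data generator above, which prepends
# token id 0 to every target sequence, so teacher forcing and beam-search decoding both start
# from the same <bos>=0 token.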
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.rouge = Rouge()
self.smooth = SmoothingFunction().method1
self.best_bleu = 0.
def on_epoch_end(self, epoch, logs=None):
metrics = self.evaluate(valid_data) # 评测模型
if metrics['bleu'] > self.best_bleu:
self.best_bleu = metrics['bleu']
model.save_weights('./best_model.weights') # 保存模型
metrics['best_bleu'] = self.best_bleu
print('valid_data:', metrics)
def evaluate(self, data, topk=1):
total = 0
rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
for title, content in tqdm(data):
total += 1
title = ' '.join(title).lower()
pred_title = ' '.join(autotitle.generate(content,
topk=topk)).lower()
if pred_title.strip():
scores = self.rouge.get_scores(hyps=pred_title, refs=title)
rouge_1 += scores[0]['rouge-1']['f']
rouge_2 += scores[0]['rouge-2']['f']
rouge_l += scores[0]['rouge-l']['f']
bleu += sentence_bleu(
references=[title.split(' ')],
hypothesis=pred_title.split(' '),
smoothing_function=self.smooth
)
rouge_1 /= total
rouge_2 /= total
rouge_l /= total
bleu /= total
return {
'rouge-1': rouge_1,
'rouge-2': rouge_2,
'rouge-l': rouge_l,
'bleu': bleu,
}
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# BERT for Seq2Seq tasks, using the UNILM scheme
# Introduction: https://kexue.fm/archives/6933
# Single-machine multi-GPU version; read it alongside task_seq2seq_autotitle.py
from __future__ import print_function
import os
os.environ['TF_KERAS'] = '1' # tf.keras must be used
import glob
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
import tensorflow as tf # 导入tf,备用
# 基本参数
maxlen = 256
batch_size = 64
steps_per_epoch = 1000
epochs = 10000
# bert配置
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'
# 训练样本。THUCNews数据集,每个样本保存为一个txt。
txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
(每次只需要返回一条样本)
"""
def __iter__(self, random=False):
for is_end, txt in self.sample(random):
text = open(txt, encoding='utf-8').read()
text = text.split('\n')
if len(text) > 1:
title = text[0]
content = '\n'.join(text[1:])
token_ids, segment_ids = tokenizer.encode(
content, title, maxlen=maxlen
)
# 返回一条样本
yield token_ids, segment_ids
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉输入部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_mask, y_pred = inputs
y_true = y_true[:, 1:] # 目标token_ids
y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分
y_pred = y_pred[:, :-1] # 预测序列,错开一位
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
strategy = tf.distribute.MirroredStrategy() # set up the single-machine multi-GPU strategy
with strategy.scope(): # build everything under this strategy
bert = build_transformer_model(
config_path,
checkpoint_path=None, # 此时可以不加载预训练权重
application='unilm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
return_keras_model=False, # 返回bert4keras类,而不是keras模型
)
model = bert.model # 这个才是keras模型
output = CrossEntropy(2)(model.inputs + model.outputs)
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
bert.load_weights_from_checkpoint(checkpoint_path) # the pretrained weights must be loaded last
class AutoTitle(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
return self.last_token(model).predict([token_ids, segment_ids])
def generate(self, text, topk=1):
max_c_len = maxlen - self.maxlen
token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
output_ids = self.beam_search([token_ids, segment_ids],
topk=topk) # 基于beam search
return tokenizer.decode(output_ids)
autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)
def just_show():
s1 = u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。'
s2 = u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。'
for s in [s1, s2]:
print(u'生成标题:', autotitle.generate(s))
print()
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
# 演示效果
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(txts, batch_size)
dataset = train_generator.to_dataset(
types=('float32', 'float32'),
shapes=([None], [None]), # together with padded_batch=True below, this enables automatic padding
names=('Input-Token', 'Input-Segment'),
padded_batch=True
) # the data must be converted to tf.data.Dataset format; names correspond to the input layer names
model.fit(
dataset,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
#! -*- coding: utf-8 -*-
# Chinese Word Segmentation (CWS) with a CRF
# Dataset: http://sighan.cs.uchicago.edu/bakeoff2005/
# Final test-set F1 is about 96.1%
import re, os, json
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder, to_array
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
maxlen = 256
epochs = 10
num_labels = 4
batch_size = 32
bert_layers = 12
learning_rate = 1e-5 # the smaller bert_layers is, the larger the learning rate should be
crf_lr_multiplier = 1 # enlarge the CRF layer's learning rate when necessary
# bert配置
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:[词1, 词2, 词3, ...]
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
D.append(re.split(' +', l.strip()))
return D
# 标注数据
data = load_data('/root/icwb2-data/training/pku_training.utf8')
# Save a random ordering (used for splitting off the validation set)
if not os.path.exists('../random_order.json'):
random_order = list(range(len(data)))
np.random.shuffle(random_order)
json.dump(random_order, open('../random_order.json', 'w'), indent=4)
else:
random_order = json.load(open('../random_order.json'))
# 划分valid
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
"""标签含义
0: 单字词; 1: 多字词首字; 2: 多字词中间; 3: 多字词末字
"""
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, item in self.sample(random):
token_ids, labels = [tokenizer._token_start_id], [0]
for w in item:
w_token_ids = tokenizer.encode(w)[0][1:-1]
if len(token_ids) + len(w_token_ids) < maxlen:
token_ids += w_token_ids
if len(w_token_ids) == 1:
labels += [0]
else:
labels += [1] + [2] * (len(w_token_ids) - 2) + [3]
else:
break
token_ids += [tokenizer._token_end_id]
labels += [0]
segment_ids = [0] * len(token_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append(labels)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
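# Informal example of the labeling scheme above: the word list ['北京', '欢迎', '你'] yields
# character labels [1, 3, 1, 3, 0] (first/last character of a multi-character word -> 1/3,
# single-character word -> 0), with the surrounding [CLS]/[SEP] positions labeled 0.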
"""
The code below assumes a BERT-style model. If you are using ALBERT instead, change the first few lines to:
model = build_transformer_model(
config_path,
checkpoint_path,
model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
output = model.get_layer(output_layer).get_output_at(bert_layers - 1)
"""
model = build_transformer_model(
config_path,
checkpoint_path,
)
output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)
model = Model(model.input, output)
model.summary()
model.compile(
loss=CRF.sparse_loss,
optimizer=Adam(learning_rate),
metrics=[CRF.sparse_accuracy]
)
class WordSegmenter(ViterbiDecoder):
"""基本分词器
"""
def tokenize(self, text):
tokens = tokenizer.tokenize(text)
while len(tokens) > 512:
tokens.pop(-2)
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.tokens_to_ids(tokens)
segment_ids = [0] * len(token_ids)
token_ids, segment_ids = to_array([token_ids], [segment_ids])
nodes = model.predict([token_ids, segment_ids])[0]
labels = self.decode(nodes)
words = []
for i, label in enumerate(labels[1:-1]):
if label < 2 or len(words) == 0:
words.append([i + 1])
else:
words[-1].append(i + 1)
return [text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1] for w in words]
segmenter = WordSegmenter(trans=K.eval(CRF.trans), starts=[0], ends=[0])
def simple_evaluate(data):
"""简单的评测
该评测指标不等价于官方的评测指标,但基本呈正相关关系,
可以用来快速筛选模型。
"""
total, right = 0., 0.
for w_true in tqdm(data):
w_pred = segmenter.tokenize(''.join(w_true))
w_pred = set(w_pred)
w_true = set(w_true)
total += len(w_true)
right += len(w_true & w_pred)
return right / total
def predict_to_file(in_file, out_file):
"""预测结果到文件,便于用官方脚本评测
使用示例:
predict_to_file('/root/icwb2-data/testing/pku_test.utf8', 'myresult.txt')
官方评测代码示例:
data_dir="/root/icwb2-data"
$data_dir/scripts/score $data_dir/gold/pku_training_words.utf8 $data_dir/gold/pku_test_gold.utf8 myresult.txt > myscore.txt
(执行完毕后查看myscore.txt的内容末尾)
"""
fw = open(out_file, 'w', encoding='utf-8')
with open(in_file, encoding='utf-8') as fr:
for l in tqdm(fr):
l = l.strip()
if l:
l = ' '.join(segmenter.tokenize(l))
fw.write(l + '\n')
fw.close()
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0
def on_epoch_end(self, epoch, logs=None):
trans = K.eval(CRF.trans)
segmenter.trans = trans
print(segmenter.trans)
acc = simple_evaluate(valid_data)
# 保存最优
if acc >= self.best_val_acc:
self.best_val_acc = acc
model.save_weights('./best_model.weights')
print('acc: %.5f, best acc: %.5f' % (acc, self.best_val_acc))
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
segmenter.trans = K.eval(CRF.trans)
#! -*- coding: utf-8 -*-
# Chinese named entity recognition with a CRF
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# Validation-set F1 reaches 96.18%, test-set F1 reaches 95.35%
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder, to_array
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
maxlen = 256
epochs = 10
batch_size = 32
bert_layers = 12
learning_rate = 1e-5 # the smaller bert_layers is, the larger the learning rate should be
crf_lr_multiplier = 1000 # enlarge the CRF layer's learning rate when necessary
# bert配置
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:[(片段1, 标签1), (片段2, 标签2), (片段3, 标签3), ...]
"""
D = []
with open(filename, encoding='utf-8') as f:
f = f.read()
for l in f.split('\n\n'):
if not l:
continue
d, last_flag = [], ''
for c in l.split('\n'):
char, this_flag = c.split(' ')
if this_flag == 'O' and last_flag == 'O':
d[-1][0] += char
elif this_flag == 'O' and last_flag != 'O':
d.append([char, 'O'])
elif this_flag[:1] == 'B':
d.append([char, this_flag[2:]])
else:
d[-1][0] += char
last_flag = this_flag
D.append(d)
return D
# 标注数据
train_data = load_data('/root/ner/china-people-daily-ner-corpus/example.train')
valid_data = load_data('/root/ner/china-people-daily-ner-corpus/example.dev')
test_data = load_data('/root/ner/china-people-daily-ner-corpus/example.test')
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 类别映射
labels = ['PER', 'LOC', 'ORG']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1
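# Informal example of the label scheme: with labels = ['PER', 'LOC', 'ORG'],
# 'O' -> 0, B-PER/I-PER -> 1/2, B-LOC/I-LOC -> 3/4, B-ORG/I-ORG -> 5/6,
# so num_labels = 3 * 2 + 1 = 7.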
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, item in self.sample(random):
token_ids, labels = [tokenizer._token_start_id], [0]
for w, l in item:
w_token_ids = tokenizer.encode(w)[0][1:-1]
if len(token_ids) + len(w_token_ids) < maxlen:
token_ids += w_token_ids
if l == 'O':
labels += [0] * len(w_token_ids)
else:
B = label2id[l] * 2 + 1
I = label2id[l] * 2 + 2
labels += ([B] + [I] * (len(w_token_ids) - 1))
else:
break
token_ids += [tokenizer._token_end_id]
labels += [0]
segment_ids = [0] * len(token_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append(labels)
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
"""
The code below assumes a BERT-style model. If you are using ALBERT instead, change the first few lines to:
model = build_transformer_model(
config_path,
checkpoint_path,
model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
output = model.get_layer(output_layer).get_output_at(bert_layers - 1)
"""
model = build_transformer_model(
config_path,
checkpoint_path,
)
output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)
model = Model(model.input, output)
model.summary()
model.compile(
loss=CRF.sparse_loss,
optimizer=Adam(learning_rate),
metrics=[CRF.sparse_accuracy]
)
class NamedEntityRecognizer(ViterbiDecoder):
"""命名实体识别器
"""
def recognize(self, text):
tokens = tokenizer.tokenize(text)
while len(tokens) > 512:
tokens.pop(-2)
mapping = tokenizer.rematch(text, tokens)
token_ids = tokenizer.tokens_to_ids(tokens)
segment_ids = [0] * len(token_ids)
token_ids, segment_ids = to_array([token_ids], [segment_ids])
nodes = model.predict([token_ids, segment_ids])[0]
labels = self.decode(nodes)
entities, starting = [], False
for i, label in enumerate(labels):
if label > 0:
if label % 2 == 1:
starting = True
entities.append([[i], id2label[(label - 1) // 2]])
elif starting:
entities[-1][0].append(i)
else:
starting = False
else:
starting = False
return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
for w, l in entities]
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
def evaluate(data):
"""评测函数
"""
X, Y, Z = 1e-10, 1e-10, 1e-10
for d in tqdm(data):
text = ''.join([i[0] for i in d])
R = set(NER.recognize(text))
T = set([tuple(i) for i in d if i[1] != 'O'])
X += len(R & T)
Y += len(R)
Z += len(T)
f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
return f1, precision, recall
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_f1 = 0
def on_epoch_end(self, epoch, logs=None):
trans = K.eval(CRF.trans)
NER.trans = trans
print(NER.trans)
f1, precision, recall = evaluate(valid_data)
# 保存最优
if f1 >= self.best_val_f1:
self.best_val_f1 = f1
model.save_weights('./best_model.weights')
print(
'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
(f1, precision, recall, self.best_val_f1)
)
f1, precision, recall = evaluate(test_data)
print(
'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
(f1, precision, recall)
)
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
NER.trans = K.eval(CRF.trans)