Commit 0e29b9b7 authored by xuxo

yidong infer init
#! -*- coding: utf-8 -*-
# Convert the CloudWalk pretrained BART model into weights usable by bert4torch
# Weight link (Baidu Netdisk address):
import torch
ckpt_file = 'F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/pytorch_base_model_2024000.pt'
torch_weights = torch.load(ckpt_file)
mapping = {'bart.embeddings.word_embeddings.weight': 'encoder.embed_tokens.weight',
'bart.embeddings.position_embeddings.weight': 'encoder.embed_positions.weight',
'bart.embeddings.LayerNorm.weight': 'encoder.layernorm_embedding.weight',
'bart.embeddings.LayerNorm.bias': 'encoder.layernorm_embedding.bias',
'bart.encoder.encoder_layer.0.attention.self.query.weight': 'encoder.layers.0.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.0.attention.self.query.bias': 'encoder.layers.0.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.0.attention.self.key.weight': 'encoder.layers.0.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.0.attention.self.key.bias': 'encoder.layers.0.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.0.attention.self.value.weight': 'encoder.layers.0.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.0.attention.self.value.bias': 'encoder.layers.0.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.0.attention.output.dense.weight': 'encoder.layers.0.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.0.attention.output.dense.bias': 'encoder.layers.0.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.0.attention.output.LayerNorm.weight': 'encoder.layers.0.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.0.attention.output.LayerNorm.bias': 'encoder.layers.0.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.0.intermediate.dense.weight': 'encoder.layers.0.fc1.weight',
'bart.encoder.encoder_layer.0.intermediate.dense.bias': 'encoder.layers.0.fc1.bias',
'bart.encoder.encoder_layer.0.output.dense.weight': 'encoder.layers.0.fc2.weight',
'bart.encoder.encoder_layer.0.output.dense.bias': 'encoder.layers.0.fc2.bias',
'bart.encoder.encoder_layer.0.output.LayerNorm.weight': 'encoder.layers.0.final_layer_norm.weight',
'bart.encoder.encoder_layer.0.output.LayerNorm.bias': 'encoder.layers.0.final_layer_norm.bias',
'bart.encoder.encoder_layer.1.attention.self.query.weight': 'encoder.layers.1.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.1.attention.self.query.bias': 'encoder.layers.1.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.1.attention.self.key.weight': 'encoder.layers.1.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.1.attention.self.key.bias': 'encoder.layers.1.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.1.attention.self.value.weight': 'encoder.layers.1.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.1.attention.self.value.bias': 'encoder.layers.1.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.1.attention.output.dense.weight': 'encoder.layers.1.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.1.attention.output.dense.bias': 'encoder.layers.1.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.1.attention.output.LayerNorm.weight': 'encoder.layers.1.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.1.attention.output.LayerNorm.bias': 'encoder.layers.1.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.1.intermediate.dense.weight': 'encoder.layers.1.fc1.weight',
'bart.encoder.encoder_layer.1.intermediate.dense.bias': 'encoder.layers.1.fc1.bias',
'bart.encoder.encoder_layer.1.output.dense.weight': 'encoder.layers.1.fc2.weight',
'bart.encoder.encoder_layer.1.output.dense.bias': 'encoder.layers.1.fc2.bias',
'bart.encoder.encoder_layer.1.output.LayerNorm.weight': 'encoder.layers.1.final_layer_norm.weight',
'bart.encoder.encoder_layer.1.output.LayerNorm.bias': 'encoder.layers.1.final_layer_norm.bias',
'bart.encoder.encoder_layer.2.attention.self.query.weight': 'encoder.layers.2.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.2.attention.self.query.bias': 'encoder.layers.2.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.2.attention.self.key.weight': 'encoder.layers.2.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.2.attention.self.key.bias': 'encoder.layers.2.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.2.attention.self.value.weight': 'encoder.layers.2.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.2.attention.self.value.bias': 'encoder.layers.2.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.2.attention.output.dense.weight': 'encoder.layers.2.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.2.attention.output.dense.bias': 'encoder.layers.2.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.2.attention.output.LayerNorm.weight': 'encoder.layers.2.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.2.attention.output.LayerNorm.bias': 'encoder.layers.2.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.2.intermediate.dense.weight': 'encoder.layers.2.fc1.weight',
'bart.encoder.encoder_layer.2.intermediate.dense.bias': 'encoder.layers.2.fc1.bias',
'bart.encoder.encoder_layer.2.output.dense.weight': 'encoder.layers.2.fc2.weight',
'bart.encoder.encoder_layer.2.output.dense.bias': 'encoder.layers.2.fc2.bias',
'bart.encoder.encoder_layer.2.output.LayerNorm.weight': 'encoder.layers.2.final_layer_norm.weight',
'bart.encoder.encoder_layer.2.output.LayerNorm.bias': 'encoder.layers.2.final_layer_norm.bias',
'bart.encoder.encoder_layer.3.attention.self.query.weight': 'encoder.layers.3.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.3.attention.self.query.bias': 'encoder.layers.3.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.3.attention.self.key.weight': 'encoder.layers.3.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.3.attention.self.key.bias': 'encoder.layers.3.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.3.attention.self.value.weight': 'encoder.layers.3.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.3.attention.self.value.bias': 'encoder.layers.3.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.3.attention.output.dense.weight': 'encoder.layers.3.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.3.attention.output.dense.bias': 'encoder.layers.3.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.3.attention.output.LayerNorm.weight': 'encoder.layers.3.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.3.attention.output.LayerNorm.bias': 'encoder.layers.3.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.3.intermediate.dense.weight': 'encoder.layers.3.fc1.weight',
'bart.encoder.encoder_layer.3.intermediate.dense.bias': 'encoder.layers.3.fc1.bias',
'bart.encoder.encoder_layer.3.output.dense.weight': 'encoder.layers.3.fc2.weight',
'bart.encoder.encoder_layer.3.output.dense.bias': 'encoder.layers.3.fc2.bias',
'bart.encoder.encoder_layer.3.output.LayerNorm.weight': 'encoder.layers.3.final_layer_norm.weight',
'bart.encoder.encoder_layer.3.output.LayerNorm.bias': 'encoder.layers.3.final_layer_norm.bias',
'bart.encoder.encoder_layer.4.attention.self.query.weight': 'encoder.layers.4.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.4.attention.self.query.bias': 'encoder.layers.4.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.4.attention.self.key.weight': 'encoder.layers.4.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.4.attention.self.key.bias': 'encoder.layers.4.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.4.attention.self.value.weight': 'encoder.layers.4.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.4.attention.self.value.bias': 'encoder.layers.4.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.4.attention.output.dense.weight': 'encoder.layers.4.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.4.attention.output.dense.bias': 'encoder.layers.4.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.4.attention.output.LayerNorm.weight': 'encoder.layers.4.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.4.attention.output.LayerNorm.bias': 'encoder.layers.4.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.4.intermediate.dense.weight': 'encoder.layers.4.fc1.weight',
'bart.encoder.encoder_layer.4.intermediate.dense.bias': 'encoder.layers.4.fc1.bias',
'bart.encoder.encoder_layer.4.output.dense.weight': 'encoder.layers.4.fc2.weight',
'bart.encoder.encoder_layer.4.output.dense.bias': 'encoder.layers.4.fc2.bias',
'bart.encoder.encoder_layer.4.output.LayerNorm.weight': 'encoder.layers.4.final_layer_norm.weight',
'bart.encoder.encoder_layer.4.output.LayerNorm.bias': 'encoder.layers.4.final_layer_norm.bias',
'bart.encoder.encoder_layer.5.attention.self.query.weight': 'encoder.layers.5.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.5.attention.self.query.bias': 'encoder.layers.5.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.5.attention.self.key.weight': 'encoder.layers.5.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.5.attention.self.key.bias': 'encoder.layers.5.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.5.attention.self.value.weight': 'encoder.layers.5.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.5.attention.self.value.bias': 'encoder.layers.5.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.5.attention.output.dense.weight': 'encoder.layers.5.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.5.attention.output.dense.bias': 'encoder.layers.5.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.5.attention.output.LayerNorm.weight': 'encoder.layers.5.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.5.attention.output.LayerNorm.bias': 'encoder.layers.5.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.5.intermediate.dense.weight': 'encoder.layers.5.fc1.weight',
'bart.encoder.encoder_layer.5.intermediate.dense.bias': 'encoder.layers.5.fc1.bias',
'bart.encoder.encoder_layer.5.output.dense.weight': 'encoder.layers.5.fc2.weight',
'bart.encoder.encoder_layer.5.output.dense.bias': 'encoder.layers.5.fc2.bias',
'bart.encoder.encoder_layer.5.output.LayerNorm.weight': 'encoder.layers.5.final_layer_norm.weight',
'bart.encoder.encoder_layer.5.output.LayerNorm.bias': 'encoder.layers.5.final_layer_norm.bias',
'bart.decoder.decoder_layer.0.attention.self.query.weight': 'decoder.layers.0.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.0.attention.self.query.bias': 'decoder.layers.0.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.0.attention.self.key.weight': 'decoder.layers.0.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.0.attention.self.key.bias': 'decoder.layers.0.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.0.attention.self.value.weight': 'decoder.layers.0.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.0.attention.self.value.bias': 'decoder.layers.0.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.0.attention.output.dense.weight': 'decoder.layers.0.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.0.attention.output.dense.bias': 'decoder.layers.0.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.0.attention.output.LayerNorm.weight': 'decoder.layers.0.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.0.attention.output.LayerNorm.bias': 'decoder.layers.0.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.0.crossattention.self.query.weight': 'decoder.layers.0.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.self.query.bias': 'decoder.layers.0.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.self.key.weight': 'decoder.layers.0.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.self.key.bias': 'decoder.layers.0.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.self.value.weight': 'decoder.layers.0.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.self.value.bias': 'decoder.layers.0.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.output.dense.weight': 'decoder.layers.0.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.output.dense.bias': 'decoder.layers.0.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.output.LayerNorm.weight': 'decoder.layers.0.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.0.crossattention.output.LayerNorm.bias': 'decoder.layers.0.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.0.intermediate.dense.weight': 'decoder.layers.0.fc1.weight',
'bart.decoder.decoder_layer.0.intermediate.dense.bias': 'decoder.layers.0.fc1.bias',
'bart.decoder.decoder_layer.0.output.dense.weight': 'decoder.layers.0.fc2.weight',
'bart.decoder.decoder_layer.0.output.dense.bias': 'decoder.layers.0.fc2.bias',
'bart.decoder.decoder_layer.0.output.LayerNorm.weight': 'decoder.layers.0.final_layer_norm.weight',
'bart.decoder.decoder_layer.0.output.LayerNorm.bias': 'decoder.layers.0.final_layer_norm.bias',
'bart.decoder.decoder_layer.1.attention.self.query.weight': 'decoder.layers.1.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.1.attention.self.query.bias': 'decoder.layers.1.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.1.attention.self.key.weight': 'decoder.layers.1.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.1.attention.self.key.bias': 'decoder.layers.1.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.1.attention.self.value.weight': 'decoder.layers.1.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.1.attention.self.value.bias': 'decoder.layers.1.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.1.attention.output.dense.weight': 'decoder.layers.1.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.1.attention.output.dense.bias': 'decoder.layers.1.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.1.attention.output.LayerNorm.weight': 'decoder.layers.1.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.1.attention.output.LayerNorm.bias': 'decoder.layers.1.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.1.crossattention.self.query.weight': 'decoder.layers.1.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.self.query.bias': 'decoder.layers.1.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.self.key.weight': 'decoder.layers.1.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.self.key.bias': 'decoder.layers.1.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.self.value.weight': 'decoder.layers.1.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.self.value.bias': 'decoder.layers.1.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.output.dense.weight': 'decoder.layers.1.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.output.dense.bias': 'decoder.layers.1.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.output.LayerNorm.weight': 'decoder.layers.1.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.1.crossattention.output.LayerNorm.bias': 'decoder.layers.1.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.1.intermediate.dense.weight': 'decoder.layers.1.fc1.weight',
'bart.decoder.decoder_layer.1.intermediate.dense.bias': 'decoder.layers.1.fc1.bias',
'bart.decoder.decoder_layer.1.output.dense.weight': 'decoder.layers.1.fc2.weight',
'bart.decoder.decoder_layer.1.output.dense.bias': 'decoder.layers.1.fc2.bias',
'bart.decoder.decoder_layer.1.output.LayerNorm.weight': 'decoder.layers.1.final_layer_norm.weight',
'bart.decoder.decoder_layer.1.output.LayerNorm.bias': 'decoder.layers.1.final_layer_norm.bias',
'bart.decoder.decoder_layer.2.attention.self.query.weight': 'decoder.layers.2.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.2.attention.self.query.bias': 'decoder.layers.2.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.2.attention.self.key.weight': 'decoder.layers.2.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.2.attention.self.key.bias': 'decoder.layers.2.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.2.attention.self.value.weight': 'decoder.layers.2.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.2.attention.self.value.bias': 'decoder.layers.2.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.2.attention.output.dense.weight': 'decoder.layers.2.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.2.attention.output.dense.bias': 'decoder.layers.2.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.2.attention.output.LayerNorm.weight': 'decoder.layers.2.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.2.attention.output.LayerNorm.bias': 'decoder.layers.2.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.2.crossattention.self.query.weight': 'decoder.layers.2.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.self.query.bias': 'decoder.layers.2.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.self.key.weight': 'decoder.layers.2.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.self.key.bias': 'decoder.layers.2.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.self.value.weight': 'decoder.layers.2.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.self.value.bias': 'decoder.layers.2.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.output.dense.weight': 'decoder.layers.2.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.output.dense.bias': 'decoder.layers.2.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.output.LayerNorm.weight': 'decoder.layers.2.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.2.crossattention.output.LayerNorm.bias': 'decoder.layers.2.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.2.intermediate.dense.weight': 'decoder.layers.2.fc1.weight',
'bart.decoder.decoder_layer.2.intermediate.dense.bias': 'decoder.layers.2.fc1.bias',
'bart.decoder.decoder_layer.2.output.dense.weight': 'decoder.layers.2.fc2.weight',
'bart.decoder.decoder_layer.2.output.dense.bias': 'decoder.layers.2.fc2.bias',
'bart.decoder.decoder_layer.2.output.LayerNorm.weight': 'decoder.layers.2.final_layer_norm.weight',
'bart.decoder.decoder_layer.2.output.LayerNorm.bias': 'decoder.layers.2.final_layer_norm.bias',
'bart.decoder.decoder_layer.3.attention.self.query.weight': 'decoder.layers.3.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.3.attention.self.query.bias': 'decoder.layers.3.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.3.attention.self.key.weight': 'decoder.layers.3.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.3.attention.self.key.bias': 'decoder.layers.3.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.3.attention.self.value.weight': 'decoder.layers.3.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.3.attention.self.value.bias': 'decoder.layers.3.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.3.attention.output.dense.weight': 'decoder.layers.3.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.3.attention.output.dense.bias': 'decoder.layers.3.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.3.attention.output.LayerNorm.weight': 'decoder.layers.3.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.3.attention.output.LayerNorm.bias': 'decoder.layers.3.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.3.crossattention.self.query.weight': 'decoder.layers.3.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.self.query.bias': 'decoder.layers.3.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.self.key.weight': 'decoder.layers.3.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.self.key.bias': 'decoder.layers.3.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.self.value.weight': 'decoder.layers.3.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.self.value.bias': 'decoder.layers.3.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.output.dense.weight': 'decoder.layers.3.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.output.dense.bias': 'decoder.layers.3.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.output.LayerNorm.weight': 'decoder.layers.3.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.3.crossattention.output.LayerNorm.bias': 'decoder.layers.3.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.3.intermediate.dense.weight': 'decoder.layers.3.fc1.weight',
'bart.decoder.decoder_layer.3.intermediate.dense.bias': 'decoder.layers.3.fc1.bias',
'bart.decoder.decoder_layer.3.output.dense.weight': 'decoder.layers.3.fc2.weight',
'bart.decoder.decoder_layer.3.output.dense.bias': 'decoder.layers.3.fc2.bias',
'bart.decoder.decoder_layer.3.output.LayerNorm.weight': 'decoder.layers.3.final_layer_norm.weight',
'bart.decoder.decoder_layer.3.output.LayerNorm.bias': 'decoder.layers.3.final_layer_norm.bias',
'bart.decoder.decoder_layer.4.attention.self.query.weight': 'decoder.layers.4.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.4.attention.self.query.bias': 'decoder.layers.4.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.4.attention.self.key.weight': 'decoder.layers.4.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.4.attention.self.key.bias': 'decoder.layers.4.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.4.attention.self.value.weight': 'decoder.layers.4.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.4.attention.self.value.bias': 'decoder.layers.4.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.4.attention.output.dense.weight': 'decoder.layers.4.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.4.attention.output.dense.bias': 'decoder.layers.4.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.4.attention.output.LayerNorm.weight': 'decoder.layers.4.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.4.attention.output.LayerNorm.bias': 'decoder.layers.4.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.4.crossattention.self.query.weight': 'decoder.layers.4.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.self.query.bias': 'decoder.layers.4.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.self.key.weight': 'decoder.layers.4.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.self.key.bias': 'decoder.layers.4.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.self.value.weight': 'decoder.layers.4.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.self.value.bias': 'decoder.layers.4.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.output.dense.weight': 'decoder.layers.4.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.output.dense.bias': 'decoder.layers.4.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.output.LayerNorm.weight': 'decoder.layers.4.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.4.crossattention.output.LayerNorm.bias': 'decoder.layers.4.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.4.intermediate.dense.weight': 'decoder.layers.4.fc1.weight',
'bart.decoder.decoder_layer.4.intermediate.dense.bias': 'decoder.layers.4.fc1.bias',
'bart.decoder.decoder_layer.4.output.dense.weight': 'decoder.layers.4.fc2.weight',
'bart.decoder.decoder_layer.4.output.dense.bias': 'decoder.layers.4.fc2.bias',
'bart.decoder.decoder_layer.4.output.LayerNorm.weight': 'decoder.layers.4.final_layer_norm.weight',
'bart.decoder.decoder_layer.4.output.LayerNorm.bias': 'decoder.layers.4.final_layer_norm.bias',
'bart.decoder.decoder_layer.5.attention.self.query.weight': 'decoder.layers.5.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.5.attention.self.query.bias': 'decoder.layers.5.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.5.attention.self.key.weight': 'decoder.layers.5.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.5.attention.self.key.bias': 'decoder.layers.5.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.5.attention.self.value.weight': 'decoder.layers.5.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.5.attention.self.value.bias': 'decoder.layers.5.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.5.attention.output.dense.weight': 'decoder.layers.5.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.5.attention.output.dense.bias': 'decoder.layers.5.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.5.attention.output.LayerNorm.weight': 'decoder.layers.5.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.5.attention.output.LayerNorm.bias': 'decoder.layers.5.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.5.crossattention.self.query.weight': 'decoder.layers.5.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.self.query.bias': 'decoder.layers.5.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.self.key.weight': 'decoder.layers.5.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.self.key.bias': 'decoder.layers.5.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.self.value.weight': 'decoder.layers.5.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.self.value.bias': 'decoder.layers.5.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.output.dense.weight': 'decoder.layers.5.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.output.dense.bias': 'decoder.layers.5.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.output.LayerNorm.weight': 'decoder.layers.5.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.5.crossattention.output.LayerNorm.bias': 'decoder.layers.5.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.5.intermediate.dense.weight': 'decoder.layers.5.fc1.weight',
'bart.decoder.decoder_layer.5.intermediate.dense.bias': 'decoder.layers.5.fc1.bias',
'bart.decoder.decoder_layer.5.output.dense.weight': 'decoder.layers.5.fc2.weight',
'bart.decoder.decoder_layer.5.output.dense.bias': 'decoder.layers.5.fc2.bias',
'bart.decoder.decoder_layer.5.output.LayerNorm.weight': 'decoder.layers.5.final_layer_norm.weight',
'bart.decoder.decoder_layer.5.output.LayerNorm.bias': 'decoder.layers.5.final_layer_norm.bias'}
model_new = {}
for key, value in mapping.items():
model_new[value] = torch_weights[key]
torch.save(model_new, 'F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/bert4torch_pytorch_model.bin')
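# Optional sanity check (a minimal sketch): reload the converted checkpoint and confirm that
# every mapped tensor kept its original shape.
converted = torch.load('F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/bert4torch_pytorch_model.bin')
assert len(converted) == len(mapping)
for old_key, new_key in mapping.items():
    assert converted[new_key].shape == torch_weights[old_key].shape, new_key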
#! -*- coding: utf-8 -*-
# Convert the FudanNLP (fastNLP) pretrained BART model into weights usable by bert4torch
# Weights: https://github.com/fastnlp/CPT
import torch
state_dict = torch.load('F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/pytorch_model.bin')
state_dict_new = {}
for k, v in state_dict.items():
# The main change: the checkpoint has 514 position embeddings by default; drop the first two
if 'embed_positions.weight' in k:
v = v[2:]
state_dict_new[k] = v
else:
state_dict_new[k] = v
torch.save(state_dict_new, 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_pytorch_model.bin')
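# Optional sanity check (a minimal sketch, assuming the checkpoint follows the standard BART
# key layout): the trimmed position embeddings should be exactly two rows shorter.
for k in state_dict:
    if 'embed_positions.weight' in k:
        assert state_dict_new[k].shape[0] == state_dict[k].shape[0] - 2, k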
'''config file
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"type_vocab_size": 2,
"vocab_size": 21128
}
'''
# Convert the bert-base-chinese weights from Hugging Face
# Weights: https://huggingface.co/bert-base-chinese
# The keys are not fully aligned with this framework's keys: the checkpoint stores the LayerNorm weight and bias as LayerNorm.gamma and LayerNorm.beta
# Alternatively, the TF weights (https://github.com/google-research/bert) can be converted with the built-in transformers command
# Conversion command: https://huggingface.co/docs/transformers/converting_tensorflow_models
import torch
state_dict = torch.load('F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/pytorch_model.bin')
state_dict_new = {}
for k, v in state_dict.items():
if 'LayerNorm.gamma' in k:
k = k.replace('LayerNorm.gamma', 'LayerNorm.weight')
state_dict_new[k] = v
elif 'LayerNorm.beta' in k:
k = k.replace('LayerNorm.beta', 'LayerNorm.bias')
state_dict_new[k] = v
else:
state_dict_new[k] = v
torch.save(state_dict_new, 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/bert4torch_pytorch_model.bin')
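# Optional sanity check (a minimal sketch): after the rename no gamma/beta keys should remain,
# and the number of tensors must be unchanged.
assert len(state_dict_new) == len(state_dict)
assert not any(('LayerNorm.gamma' in k) or ('LayerNorm.beta' in k) for k in state_dict_new)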
# config file
'''
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"type_vocab_size": 2,
"vocab_size": 21128
}
'''
#! -*- coding: utf-8 -*-
# Convert the Chinese GPT-2 model (CPM, 2.6 billion parameters) open-sourced by Tsinghua
# Project link (TF version): https://github.com/TsinghuaAI/CPM-Generate
# PyTorch weights download: https://huggingface.co/TsinghuaAI/CPM-Generate; this script converts them into weights usable by bert4torch
import torch
ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 32
def convert():
torch_weights = torch.load(ckpt_file)
new_weights = {}
prefix = 'gpt2'
w = torch_weights['transformer.wte.weight']
new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
w = torch_weights['transformer.wpe.weight']
new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w
qkv = ['query', 'key', 'value']
for i in range(num_hidden_layers):
prefix_i = f'{prefix}.encoder.layer.%d.' % i
# q, k, v
w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
ws = torch.chunk(w, 3, dim=1)
for k, w in zip(qkv, ws):
name = prefix_i + f'attention.self.{k}.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
bs = torch.chunk(b, 3, dim=0)
for k, b in zip(qkv, bs):
name = prefix_i + f'attention.self.{k}.bias'
new_weights[name] = b
# hidden-size to hidden-size dense (attention output projection)
w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
name = prefix_i + 'attention.output.dense.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
name = prefix_i + 'attention.output.dense.bias'
new_weights[name] = b
# layernorm1
w = torch_weights['transformer.h.%s.ln_1.weight' % i]
name = prefix_i + 'attention.output.LayerNorm.weight'
new_weights[name] = w
b = torch_weights['transformer.h.%s.ln_1.bias' % i]
name = prefix_i + 'attention.output.LayerNorm.bias'
new_weights[name] = b
# feed forward, first layer
w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
name = prefix_i + 'intermediate.dense.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
name = prefix_i + 'intermediate.dense.bias'
new_weights[name] = b
# feed forward, second layer
w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
name = prefix_i + 'output.dense.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
name = prefix_i + 'output.dense.bias'
new_weights[name] = b
# layernorm2
w = torch_weights['transformer.h.%s.ln_2.weight' % i]
name = prefix_i + 'output.LayerNorm.weight'
new_weights[name] = w
b = torch_weights['transformer.h.%s.ln_2.bias' % i]
name = prefix_i + 'output.LayerNorm.bias'
new_weights[name] = b
# layernorm_final
w = torch_weights['transformer.ln_f.weight']
new_weights[f'{prefix}.LayerNormFinal.weight'] = w
b = torch_weights['transformer.ln_f.bias']
new_weights[f'{prefix}.LayerNormFinal.bias'] = b
torch.save(new_weights, output_ckpt_file)
if __name__ == '__main__':
convert()
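# Note on the transposes above: the source checkpoint stores GPT-2 style Conv1D kernels as
# (in_features, out_features), while bert4torch expects nn.Linear's (out_features, in_features)
# layout, hence `.T`. Minimal shape check (a sketch, using hidden_size=2560, intermediate_size=10240
# and vocab_size=30000 from the config quoted below); call it after convert():
def check_shapes():
    converted = torch.load(output_ckpt_file)
    assert converted['gpt2.embeddings.word_embeddings.weight'].shape == (30000, 2560)
    assert converted['gpt2.encoder.layer.0.intermediate.dense.weight'].shape == (10240, 2560)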
# config file
'''
{
"vocab_size": 30000,
"hidden_size": 2560,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 10240,
"max_position_embeddings": 1024,
"num_attention_heads": 32,
"num_hidden_layers": 32
}
'''
#! -*- coding: utf-8 -*-
# gpt2-ml
# Project link (TF version): https://github.com/imcaspar/gpt2-ml
# PyTorch weight conversion and download: https://github.com/ghosthamlet/gpt2-ml-torch
# Finally converted by this script into weights usable by bert4torch
import torch
ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 48
def convert():
torch_weights = torch.load(ckpt_file)
new_weights = {}
prefix = 'gpt2_ml'
w = torch_weights['wte.weight']
new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
w = torch_weights['wpe.weight']
new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w
# embedding layernorm
w = torch_weights['emb_norm.weight']
new_weights[f'{prefix}.embeddings.LayerNorm.weight'] = w
b = torch_weights['emb_norm.bias']
new_weights[f'{prefix}.embeddings.LayerNorm.bias'] = b
qkv = ['query', 'key', 'value']
for i in range(num_hidden_layers):
prefix_i = f'{prefix}.encoder.layer.%d.' % i
# q, k, v
w = torch_weights['h.%s.attn.c_attn.weight' % i]
ws = torch.chunk(w, 3, dim=1)
for k, w in zip(qkv, ws):
name = prefix_i + f'attention.self.{k}.weight'
new_weights[name] = w.T
b = torch_weights['h.%s.attn.c_attn.bias' % i]
bs = torch.chunk(b, 3, dim=0)
for k, b in zip(qkv, bs):
name = prefix_i + f'attention.self.{k}.bias'
new_weights[name] = b
# hidden-size to hidden-size dense (attention output projection)
w = torch_weights['h.%s.attn.c_proj.weight' % i]
name = prefix_i + 'attention.output.dense.weight'
new_weights[name] = w.T
b = torch_weights['h.%s.attn.c_proj.bias' % i]
name = prefix_i + 'attention.output.dense.bias'
new_weights[name] = b
# layernorm1
w = torch_weights['h.%s.ln_1.weight' % i]
name = prefix_i + 'attention.output.LayerNorm.weight'
new_weights[name] = w
b = torch_weights['h.%s.ln_1.bias' % i]
name = prefix_i + 'attention.output.LayerNorm.bias'
new_weights[name] = b
# feed forward, first layer
w = torch_weights['h.%s.mlp.c_fc.weight' % i]
name = prefix_i + 'intermediate.dense.weight'
new_weights[name] = w.T
b = torch_weights['h.%s.mlp.c_fc.bias' % i]
name = prefix_i + 'intermediate.dense.bias'
new_weights[name] = b
# feed forward, second layer
w = torch_weights['h.%s.mlp.c_proj.weight' % i]
name = prefix_i + 'output.dense.weight'
new_weights[name] = w.T
b = torch_weights['h.%s.mlp.c_proj.bias' % i]
name = prefix_i + 'output.dense.bias'
new_weights[name] = b
# layernorm2
w = torch_weights['h.%s.ln_2.weight' % i]
name = prefix_i + 'output.LayerNorm.weight'
new_weights[name] = w
b = torch_weights['h.%s.ln_2.bias' % i]
name = prefix_i + 'output.LayerNorm.bias'
new_weights[name] = b
torch.save(new_weights, output_ckpt_file)
if __name__ == '__main__':
convert()
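# Optional sanity check (a minimal sketch): 4 embedding/LayerNorm tensors plus 16 tensors per
# transformer layer should have been written; call it after convert().
def check_key_count():
    converted = torch.load(output_ckpt_file)
    assert len(converted) == 4 + 16 * num_hidden_layers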
# config file
'''
{
"vocab_size": 21130,
"hidden_size": 1536,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 6144,
"max_position_embeddings": 1024,
"num_attention_heads": 24,
"num_hidden_layers": 48
}
'''
#! -*- coding: utf-8 -*-
# Convert the CDial-GPT PyTorch weights into weights compatible with bert4torch; both base and large can be converted
# Project link (torch version): https://github.com/thu-coai/CDial-GPT
import torch
ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 12
def convert():
torch_weights = torch.load(ckpt_file)
new_weights = {}
prefix = 'gpt'
# In CDial-GPT, [CLS] is id 0 and [PAD] is id 1, which is unconventional, so swap them
w = torch_weights['transformer.tokens_embed.weight']
w = torch.cat([w[1:2], w[:1], w[2:]], axis=0)
new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
w = torch_weights['transformer.positions_embed.weight']
new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w
qkv = ['query', 'key', 'value']
for i in range(num_hidden_layers):
prefix_i = f'{prefix}.encoder.layer.%d.' % i
# q, k, v
w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
ws = torch.chunk(w, 3, dim=1)
for k, w in zip(qkv, ws):
name = prefix_i + f'attention.self.{k}.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
bs = torch.chunk(b, 3, dim=0)
for k, b in zip(qkv, bs):
name = prefix_i + f'attention.self.{k}.bias'
new_weights[name] = b
# hidden-size to hidden-size dense (attention output projection)
w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
name = prefix_i + 'attention.output.dense.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
name = prefix_i + 'attention.output.dense.bias'
new_weights[name] = b
# layernorm1
w = torch_weights['transformer.h.%s.ln_1.weight' % i]
name = prefix_i + 'attention.output.LayerNorm.weight'
new_weights[name] = w
b = torch_weights['transformer.h.%s.ln_1.bias' % i]
name = prefix_i + 'attention.output.LayerNorm.bias'
new_weights[name] = b
# feed forward, first layer
w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
name = prefix_i + 'intermediate.dense.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
name = prefix_i + 'intermediate.dense.bias'
new_weights[name] = b
# feed forward, second layer
w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
name = prefix_i + 'output.dense.weight'
new_weights[name] = w.T
b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
name = prefix_i + 'output.dense.bias'
new_weights[name] = b
# layernorm2
w = torch_weights['transformer.h.%s.ln_2.weight' % i]
name = prefix_i + 'output.LayerNorm.weight'
new_weights[name] = w
b = torch_weights['transformer.h.%s.ln_2.bias' % i]
name = prefix_i + 'output.LayerNorm.bias'
new_weights[name] = b
torch.save(new_weights, output_ckpt_file)
if __name__ == '__main__':
convert()
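# Optional sanity check (a minimal sketch): after the [CLS]/[PAD] swap above, row 0 of the
# converted word embedding should equal row 1 of the original and vice versa; call it after convert().
def check_swap():
    original = torch.load(ckpt_file)['transformer.tokens_embed.weight']
    converted = torch.load(output_ckpt_file)['gpt.embeddings.word_embeddings.weight']
    assert torch.equal(converted[0], original[1]) and torch.equal(converted[1], original[0])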
# config file
'''
{
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 513,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"vocab_size": 13088,
"type_vocab_size": 3,
"shared_segment_embeddings": true
}
'''
# NEZHA model for the chit-chat task, using weights already fine-tuned by Su Jianlin; note this is not a pretrained model
# Source project: https://github.com/bojone/nezha_gpt_dialog
import torch
import tensorflow as tf
tf_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/model.ckpt'
torch_state_dict = {}
prefix = 'bert'
mapping = {
'bert/embeddings/word_embeddings': f'{prefix}.embeddings.word_embeddings.weight',
'bert/embeddings/token_type_embeddings': f'{prefix}.embeddings.token_type_embeddings.weight',
'bert/embeddings/LayerNorm/beta': f'{prefix}.embeddings.LayerNorm.bias',
'bert/embeddings/LayerNorm/gamma': f'{prefix}.embeddings.LayerNorm.weight',
'cls/predictions/transform/dense/kernel': 'cls.predictions.transform.dense.weight##',
'cls/predictions/transform/dense/bias': 'cls.predictions.transform.dense.bias',
'cls/predictions/transform/LayerNorm/beta': 'cls.predictions.transform.LayerNorm.bias',
'cls/predictions/transform/LayerNorm/gamma': 'cls.predictions.transform.LayerNorm.weight',
'cls/predictions/output_bias': 'cls.predictions.bias'
}
for i in range(12):
prefix_i = f'{prefix}.encoder.layer.%d.' % i
mapping.update({
f'bert/encoder/layer_{i}/attention/self/query/kernel': prefix_i + 'attention.self.query.weight##', # '##' suffix marks tensors to transpose
f'bert/encoder/layer_{i}/attention/self/query/bias': prefix_i + 'attention.self.query.bias',
f'bert/encoder/layer_{i}/attention/self/key/kernel': prefix_i + 'attention.self.key.weight##',
f'bert/encoder/layer_{i}/attention/self/key/bias': prefix_i + 'attention.self.key.bias',
f'bert/encoder/layer_{i}/attention/self/value/kernel': prefix_i + 'attention.self.value.weight##',
f'bert/encoder/layer_{i}/attention/self/value/bias': prefix_i + 'attention.self.value.bias',
f'bert/encoder/layer_{i}/attention/output/dense/kernel': prefix_i + 'attention.output.dense.weight##',
f'bert/encoder/layer_{i}/attention/output/dense/bias': prefix_i + 'attention.output.dense.bias',
f'bert/encoder/layer_{i}/attention/output/LayerNorm/beta': prefix_i + 'attention.output.LayerNorm.bias',
f'bert/encoder/layer_{i}/attention/output/LayerNorm/gamma': prefix_i + 'attention.output.LayerNorm.weight',
f'bert/encoder/layer_{i}/intermediate/dense/kernel': prefix_i + 'intermediate.dense.weight##',
f'bert/encoder/layer_{i}/intermediate/dense/bias': prefix_i + 'intermediate.dense.bias',
f'bert/encoder/layer_{i}/output/dense/kernel': prefix_i + 'output.dense.weight##',
f'bert/encoder/layer_{i}/output/dense/bias': prefix_i + 'output.dense.bias',
f'bert/encoder/layer_{i}/output/LayerNorm/beta': prefix_i + 'output.LayerNorm.bias',
f'bert/encoder/layer_{i}/output/LayerNorm/gamma': prefix_i + 'output.LayerNorm.weight'
})
for key, value in mapping.items():
ts = tf.train.load_variable(tf_path, key)
if value.endswith('##'):
value = value.replace('##', '')
torch_state_dict[value] = torch.from_numpy(ts).T
else:
torch_state_dict[value] = torch.from_numpy(ts)
torch_state_dict['cls.predictions.decoder.weight'] = torch_state_dict[f'{prefix}.embeddings.word_embeddings.weight']
torch_state_dict['cls.predictions.decoder.bias'] = torch_state_dict['cls.predictions.bias']
torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin')
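# Optional coverage check (a minimal sketch): list the variables actually stored in the TF
# checkpoint and report any that the mapping above does not convert (optimizer slots such as
# Adam moments are expected to show up here).
tf_vars = {name for name, _ in tf.train.list_variables(tf_path)}
print('TF variables not converted:', sorted(tf_vars - set(mapping)))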
# config file
'''
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"max_relative_position": 64,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 14195,
"use_relative_position": true
}
'''
# Train a Chinese chess (xiangqi) playing model by supervising a language model on game records
# Introduction: https://kexue.fm/archives/7877
# This only converts the model Su Jianlin has already trained; note this is not a pretrained model
import numpy as np
import h5py
import torch
# keras==2.3.1 is used here
from keras.engine import saving
tf_path = 'E:/Github/bert4keras/examples/best_model_chess.weights'
torch_state_dict = {}
# 1 means transpose, 0 means keep unchanged
key_map = {
'Embedding-Token/embeddings:0': ['embeddings.word_embeddings.weight', 0],
'Embedding-Segment/embeddings:0': ['embeddings.segment_embeddings.weight', 0],
'Embedding-Position/embeddings:0': ['embeddings.position_embeddings.weight', 0],
'Embedding-Norm/gamma:0': ['embeddings.layerNorm.weight', 0],
'Embedding-Norm/beta:0': ['embeddings.layerNorm.bias', 0],
'MLM-Dense/kernel:0': ['mlmDense.weight', 1],
'MLM-Dense/bias:0': ['mlmDense.bias', 0],
'MLM-Norm/gamma:0': ['mlmLayerNorm.weight', 0],
'MLM-Norm/beta:0': ['mlmLayerNorm.bias', 0],
'MLM-Bias/bias:0': ['mlmBias', 0],
}
for i in range(12):
key_map.update({
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.q.weight', 1],
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.q.bias', 0],
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.k.weight', 1],
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.k.bias', 0],
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.v.weight', 1],
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.v.bias', 0],
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.o.weight', 1],
f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.o.bias', 0],
f'Transformer-{i}-MultiHeadSelfAttention-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm1.weight', 0],
f'Transformer-{i}-MultiHeadSelfAttention-Norm/beta:0': [f'encoderLayer.{i}.layerNorm1.bias', 0],
f'Transformer-{i}-FeedForward/dense_{i*6+5}/kernel:0': [f'encoderLayer.{i}.feedForward.intermediateDense.weight', 1],
f'Transformer-{i}-FeedForward/dense_{i*6+5}/bias:0': [f'encoderLayer.{i}.feedForward.intermediateDense.bias', 0],
f'Transformer-{i}-FeedForward/dense_{i*6+6}/kernel:0': [f'encoderLayer.{i}.feedForward.outputDense.weight', 1],
f'Transformer-{i}-FeedForward/dense_{i*6+6}/bias:0': [f'encoderLayer.{i}.feedForward.outputDense.bias', 0],
f'Transformer-{i}-FeedForward-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm2.weight', 0],
f'Transformer-{i}-FeedForward-Norm/beta:0': [f'encoderLayer.{i}.layerNorm2.bias', 0],
})
consume_keys = set()
with h5py.File(tf_path, mode='r') as f:
if 'layer_names' not in f.attrs and 'model_weights' in f:
f = f['model_weights']
layer_names = saving.load_attributes_from_hdf5_group(f, 'layer_names')
weight_value_tuples = []
for k, name in enumerate(layer_names):
g = f[name]
weight_names = saving.load_attributes_from_hdf5_group(g, 'weight_names')
weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
for i, weight_name in enumerate(weight_names):
new_key = key_map[weight_name][0]
if key_map[weight_name][1] == 1: # transpose
torch_state_dict[new_key] = torch.from_numpy(weight_values[i]).T
else:
torch_state_dict[new_key] = torch.from_numpy(weight_values[i])
assert new_key not in consume_keys, 'duplicate keys'
consume_keys.add(new_key)
if hasattr(f, 'close'):
f.close()
elif hasattr(f.file, 'close'):
f.file.close()
torch_state_dict['mlmDecoder.weight'] = torch_state_dict['embeddings.word_embeddings.weight']
torch_state_dict['mlmDecoder.bias'] = torch_state_dict['mlmBias']
# for k, v in torch_state_dict.items():
# print(k, v.shape)
torch.save(torch_state_dict, 'E:/Github/bert4torch/examples/others/best_model_chess.pt')
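# Optional coverage check (a minimal sketch): report any key_map entries whose weights were not
# found in the .weights file (e.g. if the dense_{n} numbering differs between exports).
missing = {v[0] for v in key_map.values()} - consume_keys
print('keys in key_map but not consumed:', sorted(missing))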
# Convert t5_pegasus from TF to a PyTorch version compatible with bert4torch
# Weights: https://github.com/ZhuiyiTechnology/t5-pegasus
import torch
import tensorflow as tf
import json
# small
tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_small]--chinese_t5_pegasus_small/'
torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small/pytorch_model.bin'
# base:
# tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_base]--chinese_t5_pegasus_base/'
# torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/pytorch_model.bin'
tf_path = tf_dir + 'model.ckpt'
with open(tf_dir + 'config.json', 'r', encoding='utf-8') as f:
config = json.load(f)
num_layers = config['num_hidden_layers']
torch_state_dict = {}
mapping = {
'shared/embedding': 'shared.weight',
'encoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T', # custom marker: a '##T' suffix means the tensor needs to be transposed
'encoder/rms_norm/scale': 'encoder.final_layer_norm.weight',
'decoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',
'decoder/rms_norm/scale': 'decoder.final_layer_norm.weight',
'decoder/logits/kernel': 'lm_head.weight##T'
}
for i in range(num_layers):
i1 = str(i).rjust(3, '0')
mapping.update({
f'encoder/block_{i1}/layer_000/SelfAttention/q': f'encoder.block.{i}.layer.0.SelfAttention.q.weight##T',
f'encoder/block_{i1}/layer_000/SelfAttention/k': f'encoder.block.{i}.layer.0.SelfAttention.k.weight##T',
f'encoder/block_{i1}/layer_000/SelfAttention/v': f'encoder.block.{i}.layer.0.SelfAttention.v.weight##T',
f'encoder/block_{i1}/layer_000/SelfAttention/o': f'encoder.block.{i}.layer.0.SelfAttention.o.weight##T',
f'encoder/block_{i1}/layer_000/rms_norm/scale': f'encoder.block.{i}.layer.0.layer_norm.weight',
f'encoder/block_{i1}/layer_001/DenseReluDense/wi_0/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight##T',
f'encoder/block_{i1}/layer_001/DenseReluDense/wi_1/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight##T',
f'encoder/block_{i1}/layer_001/DenseReluDense/wo/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wo.weight##T',
f'encoder/block_{i1}/layer_001/rms_norm/scale': f'encoder.block.{i}.layer.1.layer_norm.weight',
f'decoder/block_{i1}/layer_000/SelfAttention/q': f'decoder.block.{i}.layer.0.SelfAttention.q.weight##T',
f'decoder/block_{i1}/layer_000/SelfAttention/k': f'decoder.block.{i}.layer.0.SelfAttention.k.weight##T',
f'decoder/block_{i1}/layer_000/SelfAttention/v': f'decoder.block.{i}.layer.0.SelfAttention.v.weight##T',
f'decoder/block_{i1}/layer_000/SelfAttention/o': f'decoder.block.{i}.layer.0.SelfAttention.o.weight##T',
f'decoder/block_{i1}/layer_000/rms_norm/scale': f'decoder.block.{i}.layer.0.layer_norm.weight',
f'decoder/block_{i1}/layer_001/EncDecAttention/q': f'decoder.block.{i}.layer.1.EncDecAttention.q.weight##T',
f'decoder/block_{i1}/layer_001/EncDecAttention/k': f'decoder.block.{i}.layer.1.EncDecAttention.k.weight##T',
f'decoder/block_{i1}/layer_001/EncDecAttention/v': f'decoder.block.{i}.layer.1.EncDecAttention.v.weight##T',
f'decoder/block_{i1}/layer_001/EncDecAttention/o': f'decoder.block.{i}.layer.1.EncDecAttention.o.weight##T',
f'decoder/block_{i1}/layer_001/rms_norm/scale': f'decoder.block.{i}.layer.1.layer_norm.weight',
f'decoder/block_{i1}/layer_002/DenseReluDense/wi_0/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight##T',
f'decoder/block_{i1}/layer_002/DenseReluDense/wi_1/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight##T',
f'decoder/block_{i1}/layer_002/DenseReluDense/wo/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wo.weight##T',
f'decoder/block_{i1}/layer_002/rms_norm/scale': f'decoder.block.{i}.layer.2.layer_norm.weight',
})
for k, v in mapping.items():
ts = torch.from_numpy(tf.train.load_variable(tf_path, k))
# if len(ts.shape)==2 and ts.shape[0] == ts.shape[1]:
# print(k, v)
if v.endswith('##T'):
torch_state_dict[v.rstrip('##T')] = ts.T
else:
torch_state_dict[v] = ts
torch.save(torch_state_dict, torch_path)
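# Optional sanity check (a minimal sketch): '##T'-marked TF kernels are transposed on the way in,
# so the converted lm_head should have the same (vocab_size, hidden_size) shape as the
# untransposed shared embedding matrix.
assert torch_state_dict['lm_head.weight'].shape == torch_state_dict['shared.weight'].shape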
# config file
'''
# base version
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2048,
"num_attention_heads": 12,
"attention_head_size": 64,
"num_hidden_layers": 12,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
# small version
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"num_attention_heads": 6,
"attention_head_size": 64,
"num_hidden_layers": 8,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
'''
# Weights: https://huggingface.co/transfo-xl-wt103
# This model is English-only: it is used only for debugging the transformer_xl model structure in bert4torch and has not actually been fine-tuned
import torch
ckpt_file = 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/pytorch_model.bin'
torch_state_dict = {}
# mapping from the huggingface transfo-xl keys to bert4torch keys
key_map = {
'transformer.word_emb.emb_layers.0.weight': 'embeddings.emb_layers.0.weight',
'transformer.word_emb.emb_layers.1.weight': 'embeddings.emb_layers.1.weight',
'transformer.word_emb.emb_layers.2.weight': 'embeddings.emb_layers.2.weight',
'transformer.word_emb.emb_layers.3.weight': 'embeddings.emb_layers.3.weight',
'transformer.word_emb.emb_projs.0': 'embeddings.emb_projs.0',
'transformer.word_emb.emb_projs.1': 'embeddings.emb_projs.1',
'transformer.word_emb.emb_projs.2': 'embeddings.emb_projs.2',
'transformer.word_emb.emb_projs.3': 'embeddings.emb_projs.3',
}
for i in range(18):
key_map.update({
f'transformer.layers.{i}.dec_attn.r_r_bias': f'encoderLayer.{i}.multiHeadAttention.r_r_bias',
f'transformer.layers.{i}.dec_attn.r_w_bias': f'encoderLayer.{i}.multiHeadAttention.r_w_bias',
f'transformer.layers.{i}.dec_attn.o_net.weight': f'encoderLayer.{i}.multiHeadAttention.o.weight',
f'transformer.layers.{i}.dec_attn.layer_norm.weight': f'encoderLayer.{i}.layerNorm1.weight',
f'transformer.layers.{i}.dec_attn.layer_norm.bias': f'encoderLayer.{i}.layerNorm1.bias',
f'transformer.layers.{i}.dec_attn.r_net.weight': f'encoderLayer.{i}.multiHeadAttention.r.weight',
f'transformer.layers.{i}.pos_ff.CoreNet.0.weight': f'encoderLayer.{i}.feedForward.intermediateDense.weight',
f'transformer.layers.{i}.pos_ff.CoreNet.0.bias': f'encoderLayer.{i}.feedForward.intermediateDense.bias',
f'transformer.layers.{i}.pos_ff.CoreNet.3.weight': f'encoderLayer.{i}.feedForward.outputDense.weight',
f'transformer.layers.{i}.pos_ff.CoreNet.3.bias': f'encoderLayer.{i}.feedForward.outputDense.bias',
f'transformer.layers.{i}.pos_ff.layer_norm.weight': f'encoderLayer.{i}.layerNorm2.weight',
f'transformer.layers.{i}.pos_ff.layer_norm.bias': f'encoderLayer.{i}.layerNorm2.bias',
})
torch_weights = torch.load(ckpt_file)
model_new = {}
for key, value in key_map.items():
model_new[value] = torch_weights[key]
for i in range(18):
qkv_net = torch_weights[f'transformer.layers.{i}.dec_attn.qkv_net.weight']
model_new[f'encoderLayer.{i}.multiHeadAttention.q.weight'], model_new[f'encoderLayer.{i}.multiHeadAttention.k.weight'], model_new[f'encoderLayer.{i}.multiHeadAttention.v.weight'] = qkv_net.chunk(3, dim=0)
torch.save(model_new, 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/bert4torch_pytorch_model.bin')
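# Optional sanity check (a minimal sketch): qkv_net packs q, k and v along dim 0, so each of the
# three chunks must be exactly one third of it and q/k/v must share a shape.
q = model_new['encoderLayer.0.multiHeadAttention.q.weight']
assert q.shape == model_new['encoderLayer.0.multiHeadAttention.k.weight'].shape
assert q.shape[0] * 3 == torch_weights['transformer.layers.0.dec_attn.qkv_net.weight'].shape[0]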
# config file
'''
{
"adaptive": true,
"architectures": [
"TransfoXLLMHeadModel"
],
"attn_type": 0,
"clamp_len": 1000,
"cutoffs": [
20000,
40000,
200000
],
"d_embed": 1024,
"d_head": 64,
"intermediate_size": 4096,
"hidden_size": 1024,
"div_val": 4,
"is_dropout": true,
"adaptive_embedding": true,
"attention_probs_dropout_prob": 0.0,
"hidden_dropout_prob": 0.1,
"hidden_act": "relu",
"eos_token_id": 0,
"ext_len": 0,
"init": "normal",
"init_range": 0.01,
"init_std": 0.02,
"layer_norm_epsilon": 1e-05,
"mem_len": 1600,
"model_type": "transfo-xl",
"num_attention_heads": 16,
"num_hidden_layers": 18,
"pre_lnorm": false,
"proj_init_std": 0.01,
"same_length": true,
"sample_softmax": -1,
"task_specific_params": {
"text-generation": {
"do_sample": true,
"max_length": 250
}
},
"tgt_len": 128,
"tie_projs": [
false,
true,
true,
true
],
"tie_weight": true,
"untie_r": true,
"vocab_size": 267735
}
'''
#! -*- coding: utf-8 -*-
# BERT for the conditional language model task
# Randomly generate text by class; in this demo the classes are sentiment polarity (positive/negative)
# Reference: https://kexue.fm/archives/7124
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate, Callback, AutoRegressiveDecoder, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
# model settings
maxlen = 128
batch_size = 16
num_classes = 2
epochs = 20
# bert settings
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
# if len(D) >= 100:
# break
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append(label)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels], batch_token_ids
# Load the dataset
train_dataloader = DataLoader(MyDataset([
'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data',
'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data',
'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
c = nn.Embedding(num_classes, 128)
self.bert = build_transformer_model(config_path,
checkpoint_path,
with_mlm=True,
application='lm',
keep_tokens=keep_tokens, # keep only the tokens in keep_tokens, shrinking the original vocabulary
layer_norm_cond=c,
ignore_invalid_weights=True) # ignore weights that are not initialized from the checkpoint
def forward(self, inputs):
_, seq_output = self.bert(inputs) # [btz, seq_len, vocab_size]
return seq_output
model = Model().to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, input, target):
input = input[:, :-1, :].reshape(-1, input.shape[-1])
target = target[:, 1:].flatten()
return super().forward(input, target)
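# The loss above shifts predictions and labels by one position, so the hidden state at
# step t is trained to predict token t+1 (standard left-to-right LM training). Below is a
# small standalone sketch, not part of the original training flow, that shows the same
# alignment on dummy tensors; all shapes here are made-up illustration values.
def _demo_shifted_lm_loss(btz=2, seq_len=5, vocab=10):
    logits = torch.randn(btz, seq_len, vocab)              # fake model output [btz, seq_len, vocab]
    token_ids = torch.randint(1, vocab, (btz, seq_len))    # fake input ids, also used as labels
    shifted_logits = logits[:, :-1, :].reshape(-1, vocab)  # drop the last step
    shifted_labels = token_ids[:, 1:].flatten()            # drop the first token
    return nn.functional.cross_entropy(shifted_logits, shifted_labels, ignore_index=0)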
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class RandomSentiment(AutoRegressiveDecoder):
"""根据情感标签(0:负,1:正)随机生成一批句子
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids = output_ids
segment_ids = torch.zeros_like(token_ids, device=device)
label = inputs[0]
return model.predict([token_ids, segment_ids, label])[:, -1, :]
def generate(self, label, n=1, topp=0.95):
results = self.random_sample([[label]], n, topp=topp) # 基于随机采样
return [tokenizer.decode(ids.cpu().numpy()) for ids in results]
random_sentiment = RandomSentiment(
start_id=tokenizer._token_start_id,
end_id=tokenizer._token_end_id,
maxlen=maxlen,
device=device
)
def just_show():
print(u'正面采样:')
print(random_sentiment.generate(1, 5, 0.95), '\n')
print(u'负面采样:')
print(random_sentiment.generate(0, 5, 0.95), '\n')
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')
"""
正面采样:
[
u'外观时尚、漂亮、性价比高。',
u'外观漂亮,配置均衡,比较满意,性价比高,外观漂亮,性能较高。',
u'我是在大学的时候看到这本书的,所以一直在买。书中的作者是林静蕾,她用自己的口吻写出了一个孩子成长中的心路历程,让我看到了她们成长中的不同之处,以及她们成长过程中的不同境界。让我很欣赏!',
u'我想这是一本能够告诉读者什么是坏的,而不是教你怎样说话,告诉我什么是错。这里我推荐了《我要讲故事》,这本书是我很喜欢的一本书,我认为它的理由很多,但是,我相信我。如果你从中得到一些改进,或者你已经有了一个明智的决定。',
u'我们一家五口住的是标间,大床房,大床的床很舒服;而我们在携程网上订了两套大床房,这个酒店的价格还是比较合理的;但是房间的隔音效果不太理想,有点响的声音;酒店门口的地铁在施工中,不方便;但是酒店的门口的出租车不知道是哪个车的,打车不是很方便;酒店外面的停'
]
负面采样:
[
u'不知道是不是因为电池不太好,不是我不喜欢。',
u'看了评论才买的. 结果发现不是那么便宜, 价格也不便宜.',
u'1、外壳不容易沾手印,不容易洗洗2、屏幕有点旧, 不能下载铃声',
u'我是7月6日订购了《杜拉拉升职记》并已通过银行付款,为什么订单下了两周多至今还未到货?是收货时间太快了,可能就这么过去了吧?',
u'这本书我是在网上先看了一遍,后来我再看了一遍。感觉作者的文笔实在太烂了,特别是在写他的博客时特别别扭,写得很不专业,特别是他写股票时那个情绪调节的小男孩,简直就是自作聪明的样子,简直就是自作聪明的一种表现!'
]
"""
#! -*- coding:utf-8 -*-
# 文本分类例子下的模型压缩
# 方法为BERT-of-Theseus
# 论文:https://arxiv.org/abs/2002.02925
# 博客:https://kexue.fm/archives/7575
import json
from bert4torch.models import build_transformer_model, BaseModel, BERT
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.tokenizers import Tokenizer
from bert4torch.layers import BertLayer
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torchinfo import summary
import copy
from torch.distributions.bernoulli import Bernoulli
num_classes = 119
maxlen = 128
batch_size = 32
replacing_rate = 0.5
steps_for_replacing = 2000
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式: (文本, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for i, l in enumerate(f):
l = json.loads(l)
text, label = l['sentence'], l['label']
D.append((text, int(label)))
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/train.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/dev.json'), batch_size=batch_size, collate_fn=collate_fn)
class BERT_THESEUS(BERT):
def __init__(self, **kwargs):
super().__init__(**kwargs)
layer = BertLayer(self.hidden_size, self.num_attention_heads, self.dropout_rate, self.attention_probs_dropout_prob, self.intermediate_size, self.hidden_act, is_dropout=False, conditional_size=self.conditional_size)
        self.encoderLayer = nn.ModuleList([copy.deepcopy(layer) for _ in range(self.num_hidden_layers)])
self.scc_n_layer = 6 # 蒸馏到6层
self.scc_layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(self.scc_n_layer)])
self.compress_ratio = self.num_hidden_layers // self.scc_n_layer
self.bernoulli = None
def set_replacing_rate(self, replacing_rate):
if not 0 < replacing_rate <= 1:
raise Exception('Replace rate must be in the range (0, 1]!')
self.bernoulli = Bernoulli(torch.tensor([replacing_rate]))
def apply_main_layers(self, inputs):
"""BERT的主体是基于Self-Attention的模块
顺序:Att --> Add --> LN --> FFN --> Add --> LN
"""
hidden_states, attention_mask, conditional_emb = inputs
encoded_layers = [hidden_states] # 添加embedding的输出
if self.training:
inference_layers = []
for i in range(self.scc_n_layer):
if self.bernoulli.sample() == 1: # REPLACE
inference_layers.append(self.scc_layer[i])
else: # KEEP the original
for offset in range(self.compress_ratio):
inference_layers.append(self.encoderLayer[i * self.compress_ratio + offset])
else: # inference with compressed model
inference_layers = self.scc_layer
# forward
for i, layer_module in enumerate(inference_layers):
hidden_states = layer_module(hidden_states, attention_mask, conditional_emb)
if self.output_all_encoded_layers:
encoded_layers.append(hidden_states)
if not self.output_all_encoded_layers:
encoded_layers.append(hidden_states)
return [encoded_layers, conditional_emb]
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=BERT_THESEUS)
self.dense = nn.Linear(self.bert.configs['hidden_size'], num_classes)
def forward(self, token_ids, segment_ids):
encoded_layers = self.bert([token_ids, segment_ids])
output = self.dense(encoded_layers[:, 0, :]) # 取第1个位置
return output
model = Model().to(device)
summary(model, input_data=next(iter(train_dataloader))[0])
# replacing策略
class ConstantReplacementScheduler:
def __init__(self, bert_encoder, replacing_rate, replacing_steps=None):
self.bert_encoder = bert_encoder
self.replacing_rate = replacing_rate
self.replacing_steps = replacing_steps
self.step_counter = 0
self.bert_encoder.set_replacing_rate(replacing_rate)
def step(self):
self.step_counter += 1
if self.replacing_steps is None or self.replacing_rate == 1.0:
return self.replacing_rate
else:
if self.step_counter >= self.replacing_steps:
self.bert_encoder.set_replacing_rate(1.0)
self.replacing_rate = 1.0
return self.replacing_rate
class LinearReplacementScheduler:
def __init__(self, bert_encoder, base_replacing_rate, k):
self.bert_encoder = bert_encoder
self.base_replacing_rate = base_replacing_rate
self.step_counter = 0
self.k = k
self.bert_encoder.set_replacing_rate(base_replacing_rate)
def step(self):
self.step_counter += 1
current_replacing_rate = min(self.k * self.step_counter + self.base_replacing_rate, 1.0)
self.bert_encoder.set_replacing_rate(current_replacing_rate)
return current_replacing_rate
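# The two schedulers above control the probability that a successor (scc) block replaces
# its corresponding predecessor layers during training. A minimal standalone sketch (the
# dummy encoder below is a stand-in for illustration only, not bert4torch's encoder)
# showing how LinearReplacementScheduler ramps the replacing rate up to 1.0:
def _demo_linear_replacement(base_rate=0.3, k=0.1, steps=10):
    class _DummyEncoder:
        def set_replacing_rate(self, rate):
            self.replacing_rate = rate
    scheduler = LinearReplacementScheduler(_DummyEncoder(), base_replacing_rate=base_rate, k=k)
    return [scheduler.step() for _ in range(steps)]  # 0.4, 0.5, ..., then capped at 1.0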
replacing_rate_scheduler = ConstantReplacementScheduler(bert_encoder=model.bert, replacing_rate=replacing_rate, replacing_steps=steps_for_replacing)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), scheduler=replacing_rate_scheduler,
metrics=['accuracy'])
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, steps, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(u'val_acc: %.5f, best_val_acc: %.5f\n' %(val_acc, self.best_val_acc))
def predict_to_file(in_file, out_file):
"""输出预测结果到文件
结果文件可以提交到 https://www.cluebenchmarks.com 评测。
"""
fw = open(out_file, 'w')
with open(in_file) as fr:
for l in tqdm(fr):
l = json.loads(l)
text = l['sentence']
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
label = model.predict([[token_ids], [segment_ids]])[0].argmax()
l = json.dumps({'id': str(l['id']), 'label': str(label)})
fw.write(l + '\n')
fw.close()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
# predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json')
#! -*- coding: utf-8 -*-
# bert做language model任务,小说生成
import glob, re
from tqdm import tqdm
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, Callback, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
maxlen = 256
batch_size = 8
epochs = 10000
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
novels = []
for txt in glob.glob(filenames):
txt = open(txt, encoding='utf-8').read()
txt = txt.replace('\r', '').replace('\n', '')
txt = txt.replace(u'整理制作,并提供下载', '')
txt = re.sub(u'www.*?com', '', txt)
txt = txt.replace(u'\u3000', ' ')
sents = []
for t in txt.split(' '):
for s in re.findall(u'.*?。', t):
if len(s) <= maxlen - 2:
sents.append(s)
novels.append(sents)
data = []
pbar = tqdm(desc=u'构建语料中', total=sum(len(n) for n in novels))
for novel in novels:
s = u''
for i in range(len(novel)):
for j in range(len(novel) - i):
if len(s) + len(novel[i + j]) > maxlen - 2:
data.append(s)
s = u''
break
else:
s += novel[i + j]
pbar.update(1)
if i + j >= len(novel):
break
if s:
data.append(s)
pbar.close()
return data
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
for text in batch:
token_ids, segment_ids = tokenizer.encode(text)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_token_ids
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/pretrain/金庸小说/*.txt'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 建模
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='lm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
_, mlm_scores = outputs
mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
target = target[:, 1:].flatten()
return super().forward(mlm_scores, target)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
# 随机采样
class StoryCompletion(AutoRegressiveDecoder):
"""基于随机采样的故事续写
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids = inputs[0]
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.zeros_like(token_ids, device=device)
_, mlm_scores = model.predict([token_ids, segment_ids])
return mlm_scores[:, -1, :]
def generate(self, text, n=1, topp=0.95):
token_ids, _ = tokenizer.encode(text)
results = self.random_sample([token_ids[:-1]], n, topp=topp) # 基于随机采样
return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]
story_completion = StoryCompletion(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def just_show():
s1 = u'当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。'
s2 = u'虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。'
s3 = u'杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。'
for s in [s1, s2, s3]:
t = story_completion.generate(s)
print(u'输入: %s' % s)
print(u'结果: %s\n' % ('\n'.join(t)))
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=100, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
"""
效果:
输入: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。
结果: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。次日清晨,张无忌便和赵敏去买了一匹高头大马,自己骑了随伴。那马甚有神骏,三十六斤重的身躯之中,竟无一头白马。他心中怦怦乱跳,暗想:若能将赵敏引出迷城,我决不致再和她相会,但若和赵姑娘相遇,我一生一世决计再难相见。何况我是她的私生女儿,这般亲热,岂不是好?我如何能和她相见?今后我要教训教训她才好?我教教她,教训她,要她心里快快活活的。他心如刀割,当即回到客店,将张无忌的所在说了。
输入: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。
结果: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。那矮子见他如此功力,大吃一惊,叫道:什么人?是谁?你干什么?我师父是谁?你们是谁?是谁?你们是谁?我师父是谁?你这矮子,便是段延庆。你们不知道我师父便是,是不是?快快说来。那矮子道:我师父便是延庆太子,他的徒弟也是段延庆。他老人家在唐朝做镇南王,你们便将他改名为延庆太子,叫做延庆太子!这名头倒怪,你们大伙儿听见了,也不知道他老人家是死是活。
输入: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。
结果: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。这时见他手中所握,竟是一柄特制的短剑,心中大喜,叫道::原来是金蛇郎君的剑!原来你便是金蛇郎君的弟子,这一下可要叫我失望了。那人哈哈一笑,说道:好啊!好啊,好啊!我的金蛇剑是我的,不过我是你的。这人道:我姓杨名过,名字叫过。你是我儿子,是我女儿,是不是?你这么大的年纪,怎地自称金刀驸马?我这就给你取个名字,叫作过儿。
"""
#! -*- coding: utf-8 -*-
# 用 语言模型+棋谱 的方式监督训练一个下中国象棋模型
# 介绍:https://kexue.fm/archives/7877
# 数据:https://github.com/bojone/gpt_cchess
# 模型训练可以在python2/python3进行。但是cchess模块只支持python3,
# 因此如果需要交互式体验模型棋力,那么需要在python3下进行。
# 权重转换脚本见:https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_roberta_chess.py
import json
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from cchess import *
# 基本信息
maxlen = 512
steps_per_epoch = 1000
epochs = 10000
batch_size = 16
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取全局棋谱
返回:[(棋谱, 结果)],其中结果等于2为红方赢棋,1为和棋,
0为黑方赢棋,-1则为无明确标注胜负。
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if not l['fen']:
result = int(l['items'].get(u'棋局结果', -1))
D.append((l['iccs'], result))
return D
# 建立分词器
chars = [u'[PAD]'] + list(u'0123456789abcdefghi')
token_dict = dict(zip(chars, range(len(chars))))
tokenizer = Tokenizer(token_dict)
tokenizer._token_unk_id = 0
bert_token_dict = load_vocab(dict_path)
keep_tokens = [bert_token_dict[c] for c in chars]
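# The tiny vocabulary above is just ICCS move notation: board files a-i plus ranks 0-9,
# so every move is a 4-character string. Illustrative sketch only ('h2e2' is a made-up
# example move, not taken from the dataset):
def _demo_iccs_to_ids(move='h2e2'):
    return [token_dict.get(c, 0) for c in move]  # unknown characters fall back to [PAD]=0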
count = 0
def get_count():
if count < 20000:
n = 8
elif count < 40000:
n = 4
elif count < 80000:
n = 2
else:
n = 1
return n
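# get_count() implements a simple curriculum: early batches are truncated more aggressively
# and the usable length grows as training proceeds. An illustrative sketch (the batch counts
# below are hypothetical) of the effective encode length, mirroring the
# maxlen // get_count() + 1 expression used in collate_fn below:
def _demo_effective_maxlen():
    effective = {}
    for fake_count in (0, 25000, 50000, 100000):
        n = 8 if fake_count < 20000 else 4 if fake_count < 40000 else 2 if fake_count < 80000 else 1
        effective[fake_count] = maxlen // n + 1
    return effective  # {0: 65, 25000: 129, 50000: 257, 100000: 513} with maxlen=512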
def collate_fn(batch):
"""数据生成器
"""
batch_token_ids, batch_segment_ids = [], []
for text, _ in batch:
token_ids, segment_ids = tokenizer.encode(' '.join(text), maxlen=maxlen // get_count() + 1)
batch_token_ids.append([0] + token_ids[1:-1])
batch_segment_ids.append([0] + segment_ids[1:-1])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
global count
count += 1
return [batch_token_ids, batch_segment_ids], batch_token_ids
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/qipu/qipu.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 由于字典中0不代表padding位,为避免attention_mask计算错误,这里token_pad_ids=-100
model = build_transformer_model(config_path, checkpoint_path, application='lm', with_mlm=True,
keep_tokens=keep_tokens, token_pad_ids=-100).to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
_, mlm_scores = outputs
mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
target = target[:, 1:].flatten()
return super().forward(mlm_scores, target)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class ChessPlayer(object):
"""交互式下棋程序
"""
def move_to_chinese(self, move):
"""将单步走法转为中文描述
"""
if not isinstance(move, Move):
move = Move(self.board, move[0], move[1])
return move.to_chinese()
def move_to_iccs(self, move):
"""将单步走法转为iccs表示
"""
if not isinstance(move, Move):
move = Move(self.board, move[0], move[1])
return move.to_iccs()
def print_board(self):
"""打印当前棋盘
直观起见,红方用红色表示,黑方用绿色表示。
"""
for l in self.board.dump_board():
for c in u'兵炮车马相仕帅':
l = l.replace(c, u'\033[1;31;40m%s\033[0m' % c)
for c in u'卒砲砗碼象士将':
l = l.replace(c, u'\033[1;32;40m%s\033[0m' % c)
print(l)
def movable_steps(self):
"""给出当前局面所有候选走法
"""
return [self.move_to_iccs(m) for m in self.board.create_moves()]
def human_input(self):
"""人类行棋
"""
while True:
try:
iccs = input(u'请输入iccs棋着: ')
print(iccs)
move = self.board.move_iccs(iccs)
if move is not None:
return iccs, move
except KeyboardInterrupt:
return None
except:
pass
def record(self, iccs):
"""将局面往前推进一步
"""
self.history += iccs
self.board.next_turn()
self.print_board()
self.current = (self.current + 1) % 2
def new_game(self, current=0):
"""开新局
"""
self.board = ChessBoard()
self.board.from_fen(FULL_INIT_FEN)
self.print_board()
self.history = ''
self.current = current
if self.current == 0: # 人类先手
iccs, move = self.human_input()
self.record(iccs)
while True:
# 机器走棋
moves = self.movable_steps()
iccses = [' '.join(self.history + m) for m in moves]
token_ids = [[0] + tokenizer.encode(ic)[0][1:-1] for ic in iccses]
token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)
segment_ids = torch.zeros_like(token_ids)
preds = model.predict([token_ids, segment_ids])[-1][:, -5:-1]
preds = nn.Softmax(dim=-1)(preds)
preds = torch.take_along_dim(preds, token_ids[:, -4:, None], dim=2)
preds = torch.log(preds + 1e-8)[:, :, 0].sum(dim=1)
iccs = moves[preds.argmax()]
move = self.board.move_iccs(iccs)
self.record(iccs)
if self.board.is_win():
print(u'机器赢了')
break
# 人类走棋
iccs, move = self.human_input()
self.record(iccs)
if self.board.is_win():
print(u'人类赢了')
break
chessplayer = ChessPlayer()
class Evaluator(Callback):
"""评估与保存
"""
def on_epoch_end(self, global_step, epoch, logs=None):
# 保存模型
# model.save_weights('./best_model_chess.pt')
pass
if __name__ == '__main__':
choice = 'eval'
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=1000, epochs=20, callbacks=[evaluator])
else:
model.load_weights('./best_model_chess.pt')
chessplayer.new_game(0) # 启动新棋局,0为人类先手,1为机器先手
#! -*- coding: utf-8 -*-
# 追一科技2019年NL2SQL挑战赛的一个Baseline(个人作品,非官方发布,基于Bert)
# 比赛地址:https://tianchi.aliyun.com/competition/entrance/231716/introduction
# 科学空间:https://kexue.fm/archives/6771
# 苏神结果是58%左右,我复现出来58.39%
# 思路:[CLS] question [SEP] [CLS] col1 [SEP] [CLS] col2 [SEP]
# 整句的[CLS]用来做conds连接符判断: {0:"", 1:"and", 2:"or"}
# col的[CLS]用来预测该列是否被select+agg聚合判断: {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"不被select"}
''' 单条样本示例
{
"table_id": "a1b2c3d4", # 相应表格的id
"question": "世茂茂悦府新盘容积率大于1,请问它的套均面积是多少?", # 自然语言问句
"sql":{ # 真实SQL
"sel": [7], # SQL选择的列
"agg": [0], # 选择的列相应的聚合函数, '0'代表无
"cond_conn_op": 0, # 条件之间的关系
"conds": [
[1, 2, "世茂茂悦府"], # 条件列, 条件类型, 条件值,col_1 == "世茂茂悦府"
[6, 0, "1"]
]
}
}
'''
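# A rough standalone sketch of the input layout described above: the question and every
# column name each get their own [CLS] ... [SEP] span, and the per-column [CLS] positions
# are later used for the select/agg predictions. The headers below are made up for
# illustration; this is not how the real tokenizer builds ids, only the textual layout.
def _demo_input_layout(question='世茂茂悦府新盘容积率大于1,请问它的套均面积是多少?',
                       headers=('小区名称', '容积率', '套均面积')):
    pieces = ['[CLS]', question, '[SEP]']
    for h in headers:
        pieces += ['[CLS]', h, '[SEP]']
    return ' '.join(pieces)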
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback
from bert4torch.optimizers import get_linear_schedule_with_warmup
import json
import codecs
import numpy as np
from tqdm import tqdm
import jieba
import editdistance
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn, optim
import re
batch_size = 16
maxlen = 160
num_agg = 7 # agg_sql_dict = {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"不被select"}
num_op = 5 # {0:">", 1:"<", 2:"==", 3:"!=", 4:"不被select"}
num_cond_conn_op = 3 # conn_sql_dict = {0:"", 1:"and", 2:"or"}
learning_rate = 2.5e-5
epochs = 15
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
def read_data(data_file, table_file):
data, tables = [], {}
with open(data_file, 'r', encoding='utf-8') as f:
for l in f:
data.append(json.loads(l))
with open(table_file, 'r', encoding='utf-8') as f:
for l in f:
l = json.loads(l)
d = {}
d['headers'] = l['header']
d['header2id'] = {j: i for i, j in enumerate(d['headers'])}
d['content'] = {}
d['all_values'] = set()
rows = np.array(l['rows'])
for i, h in enumerate(d['headers']):
d['content'][h] = set(rows[:, i])
d['all_values'].update(d['content'][h])
d['all_values'] = set([i for i in d['all_values'] if hasattr(i, '__len__')])
tables[l['id']] = d
return data, tables
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
for line in reader:
token = line.strip()
token_dict[token] = len(token_dict)
class OurTokenizer(Tokenizer):
def _tokenize(self, text):
R = []
for c in text:
if c in self._token_dict:
R.append(c)
elif self._is_space(c):
R.append('[unused1]') # space类用未经训练的[unused1]表示
else:
R.append('[UNK]') # 剩余的字符是[UNK]
return R
tokenizer = OurTokenizer(token_dict)
def most_similar(s, slist):
"""从词表中找最相近的词(当无法全匹配的时候)
"""
if len(slist) == 0:
return s
scores = [editdistance.eval(s, t) for t in slist]
return slist[np.argmin(scores)]
def most_similar_2(w, s):
"""从句子s中找与w最相近的片段,
借助分词工具和ngram的方式尽量精确地确定边界。
"""
sw = jieba.lcut(s)
sl = list(sw)
sl.extend([''.join(i) for i in zip(sw, sw[1:])])
sl.extend([''.join(i) for i in zip(sw, sw[1:], sw[2:])])
return most_similar(w, sl)
class MyDataset(Dataset):
def __init__(self, data, tables):
self.data = data
self.tables = tables
def __len__(self):
return len(self.data)
def __getitem__(self, i):
d = self.data[i]
# [CLS] question [SEP] [CLS] col1 [SEP] [CLS] col2 [SEP]
x1 = tokenizer.encode(d['question'])[0]
xm = [0] + [1] * len(d['question']) + [0]
h = []
for j in self.tables[d['table_id']]['headers']:
_x1 = tokenizer.encode(j)[0]
h.append(len(x1))
x1.extend(_x1)
if len(x1) > maxlen:
return
hm = [1] * len(h) # 列的mask
# 列是否被选择
sel = []
for j in range(len(h)):
if j in d['sql']['sel']:
j = d['sql']['sel'].index(j)
sel.append(d['sql']['agg'][j])
else:
sel.append(num_agg - 1) # 不被select则被标记为num_agg-1
conn = [d['sql']['cond_conn_op']]
csel = np.zeros(len(d['question']) + 2, dtype='int32') # 这里的0既表示padding,又表示第一列,padding部分训练时会被mask
cop = np.zeros(len(d['question']) + 2, dtype='int32') + num_op - 1 # 不被select则被标记为num_op-1
for j in d['sql']['conds']:
if j[2] not in d['question']:
j[2] = most_similar_2(j[2], d['question'])
if j[2] not in d['question']:
continue
k = d['question'].index(j[2])
csel[k + 1: k + 1 + len(j[2])] = j[0]
cop[k + 1: k + 1 + len(j[2])] = j[1]
# x1: bert的输入 [101, 123, 121, 122, 123, 2399, 122, 118, 126, 3299, 5168, 6369, 2832, 6598, ...]
# xm: bert输入mask [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]
# h: 列名[CLS]所在位置 [56, 60, 74, 89, 104, 114, 123, 132]
# hm: 列名mask [1, 1, 1, 1, 1, 1, 1, 1]
# sel: 被select查找的列 [4, 6, 6, 6, 6, 6, 6, 6], 6表示列未被select,4表示COUNT
# conn: 连接类型 [1], 1表示and
# csel: 条件中的列 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# cop: 条件中的运算符(同时也是值的标记) [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
return x1, xm, h, hm, sel, conn, csel, cop
def collate_fn(batch):
x1, xm, h, hm, sel, conn, csel, cop = zip(*[i for i in batch if i])
x1 = torch.tensor(sequence_padding(x1), dtype=torch.long, device=device)
xm = torch.tensor(sequence_padding(xm, length=x1.shape[1]), dtype=torch.long, device=device)
h = torch.tensor(sequence_padding(h), dtype=torch.long, device=device)
hm = torch.tensor(sequence_padding(hm), dtype=torch.long, device=device)
sel = torch.tensor(sequence_padding(sel), dtype=torch.long, device=device)
conn = torch.tensor(sequence_padding(conn), dtype=torch.long, device=device)
csel = torch.tensor(sequence_padding(csel, length=x1.shape[1]), dtype=torch.long, device=device)
cop = torch.tensor(sequence_padding(cop, length=x1.shape[1]), dtype=torch.long, device=device)
return [x1, h, hm], [sel, conn, csel, cop, xm, hm]
datadir = 'F:/Projects/data/corpus/other/ZhuiyiTechnology_NL2SQL'
train_dataloader = DataLoader(MyDataset(*read_data(f'{datadir}/train/train.json', f'{datadir}/train/train.tables.json')),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_data, valid_table = read_data(f'{datadir}/val/val.json', f'{datadir}/val/val.tables.json')
test_data, test_table = read_data(f'{datadir}/test/test.json', f'{datadir}/test/test.tables.json')
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
hidden_size = self.bert.configs['hidden_size']
self.conn = nn.Linear(hidden_size, num_cond_conn_op)
self.agg = nn.Linear(hidden_size, num_agg)
self.op = nn.Linear(hidden_size, num_op)
self.dense1 = nn.Linear(hidden_size, 256)
self.dense2 = nn.Linear(hidden_size, 256)
self.dense3 = nn.Linear(256, 1)
def forward(self, x1_in, h, hm):
x = self.bert([x1_in])
# cls判断条件连接符 {0:"", 1:"and", 2:"or"}
x4conn = x[:, 0] # [cls位]
pconn = self.conn(x4conn) # [btz, num_cond_conn_op]
# 列的cls位用来判断列名的agg和是否被select {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"不被select"}
x4h = torch.gather(x, dim=1, index=h.unsqueeze(-1).expand(-1, -1, 768)) # [btz, col_len, hdsz]
psel = self.agg(x4h) # [btz, col_len, num_agg]
# 序列标注conds的值和运算符
pcop = self.op(x) # [btz, seq_len, num_op]
x = x.unsqueeze(2) # [btz, seq_len, 1, hdsz]
x4h = x4h.unsqueeze(1) # [btz, 1, col_len, hdsz]
pcsel_1 = self.dense1(x) # [btz, seq_len, 1, 256]
pcsel_2 = self.dense2(x4h) # [btz, 1, col_len, 256]
pcsel = pcsel_1 + pcsel_2
pcsel = torch.tanh(pcsel)
pcsel = self.dense3(pcsel) # [btz, seq_len, col_len, 1]
pcsel = pcsel[..., 0] - (1 - hm[:, None]) * 1e10 # [btz, seq_len, col_len]
return pconn, psel, pcop, pcsel
model = Model().to(device)
class MyLoss(nn.Module):
def forward(self, outputs, labels):
pconn, psel, pcop, pcsel = outputs
sel_in, conn_in, csel_in, cop_in, xm, hm = labels
cm = torch.not_equal(cop_in, num_op - 1)
batch_size = psel.shape[0]
psel_loss = F.cross_entropy(psel.view(-1, num_agg), sel_in.view(-1), reduction='none').reshape(batch_size, -1)
psel_loss = torch.sum(psel_loss * hm) / torch.sum(hm)
pconn_loss = F.cross_entropy(pconn, conn_in.view(-1))
pcop_loss = F.cross_entropy(pcop.view(-1, num_op), cop_in.view(-1), reduction='none').reshape(batch_size, -1)
pcop_loss = torch.sum(pcop_loss * xm) / torch.sum(xm)
pcsel_loss = F.cross_entropy(pcsel.view(-1, pcsel.shape[-1]), csel_in.view(-1), reduction='none').reshape(batch_size, -1)
pcsel_loss = torch.sum(pcsel_loss * xm * cm) / torch.sum(xm * cm)
loss = psel_loss + pconn_loss + pcop_loss + pcsel_loss
return {'loss': loss, 'psel_loss': psel_loss, 'pconn_loss': pconn_loss, 'pcop_loss': pcop_loss, 'pcsel_loss': pcsel_loss}
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, len(train_dataloader), len(train_dataloader)*epochs)
model.compile(
loss=MyLoss(),
optimizer=optimizer,
scheduler=scheduler
)
def nl2sql(question, table):
"""输入question和headers,转SQL
"""
x1 = tokenizer.encode(question)[0]
h = []
for i in table['headers']:
_x1 = tokenizer.encode(i)[0]
h.append(len(x1))
x1.extend(_x1)
hm = [1] * len(h)
pconn, psel, pcop, pcsel = model.predict([
torch.tensor([x1], dtype=torch.long, device=device),
torch.tensor([h], dtype=torch.long, device=device),
torch.tensor([hm], dtype=torch.long, device=device)
])
pconn, psel, pcop, pcsel = pconn.cpu().numpy(), psel.cpu().numpy(), pcop.cpu().numpy(), pcsel.cpu().numpy()
R = {'agg': [], 'sel': []}
for i, j in enumerate(psel[0].argmax(1)):
if j != num_agg - 1: # num_agg-1类是不被select的意思
R['sel'].append(i)
R['agg'].append(int(j))
conds = []
v_op = -1
for i, j in enumerate(pcop[0, :len(question)+1].argmax(1)):
# 这里结合标注和分类来预测条件
if j != num_op - 1:
if v_op != j:
if v_op != -1:
v_end = v_start + len(v_str)
csel = pcsel[0][v_start: v_end].mean(0).argmax()
conds.append((csel, v_op, v_str))
v_start = i
v_op = j
v_str = question[i - 1]
else:
v_str += question[i - 1]
elif v_op != -1:
v_end = v_start + len(v_str)
csel = pcsel[0][v_start: v_end].mean(0).argmax()
conds.append((csel, v_op, v_str))
v_op = -1
R['conds'] = set()
for i, j, k in conds:
        if re.findall(r'[^\d\.]', k):
j = 2 # 非数字只能用等号
if j == 2:
if k not in table['all_values']:
# 等号的值必须在table出现过,否则找一个最相近的
k = most_similar(k, list(table['all_values']))
h = table['headers'][i]
# 然后检查值对应的列是否正确,如果不正确,直接修正列名
if k not in table['content'][h]:
for r, v in table['content'].items():
if k in v:
i = table['header2id'][r]
break
R['conds'].add((int(i), int(j), str(k)))
R['conds'] = list(R['conds'])
if len(R['conds']) <= 1: # 条件数少于等于1时,条件连接符直接为0
R['cond_conn_op'] = 0
else:
R['cond_conn_op'] = 1 + int(pconn[0, 1:].argmax()) # 不能是0
return R
def is_equal(R1, R2):
"""判断两个SQL字典是否全匹配
"""
return (R1['cond_conn_op'] == R2['cond_conn_op']) &\
(set(zip(R1['sel'], R1['agg'])) == set(zip(R2['sel'], R2['agg']))) &\
(set([tuple(i) for i in R1['conds']]) == set([tuple(i) for i in R2['conds']]))
class Evaluate(Callback):
def __init__(self):
self.accs = []
self.best = 0.
self.passed = 0
self.stage = 0
def on_epoch_end(self, global_step, epoch, logs=None):
acc = self.evaluate(valid_data, valid_table)
self.accs.append(acc)
if acc > self.best:
self.best = acc
# model.save_weights('best_model.weights')
print('acc: %.5f, best acc: %.5f\n' % (acc, self.best))
def evaluate(self, data, tables):
right = 0.
pbar = tqdm()
F = open('evaluate_pred.json', 'w', encoding='utf-8')
for i, d in enumerate(data):
question = d['question']
table = tables[d['table_id']]
R = nl2sql(question, table)
right += float(is_equal(R, d['sql']))
pbar.update(1)
pbar.set_description('< acc: %.5f >' % (right / (i + 1)))
d['sql_pred'] = R
try:
s = json.dumps(d, ensure_ascii=False, indent=4)
except:
continue
F.write(s + '\n')
F.close()
pbar.close()
return right / len(data)
def test(self, data, tables, outfile='result.json'):
pbar = tqdm()
        F = open(outfile, 'w', encoding='utf-8')
for i, d in enumerate(data):
question = d['question']
table = tables[d['table_id']]
R = nl2sql(question, table)
pbar.update(1)
s = json.dumps(R, ensure_ascii=False)
            F.write(s + '\n')
F.close()
pbar.close()
if __name__ == '__main__':
evaluator = Evaluate()
model.fit(
train_dataloader,
steps_per_epoch=None,
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.weights')
#! -*- coding: utf-8 -*-
# 预训练脚本,单GPU版方便测试
# 改DDP需几行代码,参考https://github.com/Tongjilibo/bert4torch/blob/master/examples/training_trick/task_distributed_data_parallel.py
from bert4torch.models import build_transformer_model
from bert4torch.snippets import sequence_padding, Callback
from bert4torch.optimizers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import json
import os
import shelve
import random
import time
# 语料路径和模型保存路径
model_saved_path = './bert_model.ckpt'
dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain'
task_name = 'roberta'
# 其他配置
maxlen = 512
batch_size = 7
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin' # 如果从零训练,就设为None
learning_rate = 0.00176
weight_decay_rate = 0.01 # 权重衰减
num_warmup_steps = 3125
num_train_steps = 125000
steps_per_epoch = 10000
grad_accum_steps = 16 # 大于1即表明使用梯度累积
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
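# For reference, with the defaults above this works out to 125000 * 16 // 10000 = 200 epochs,
# i.e. 2,000,000 batches in total (assuming steps_per_epoch counts dataloader batches).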
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 读取数据集,构建数据张量
class MyDataset(Dataset):
def __init__(self, file):
super(MyDataset, self).__init__()
self.file = file
self.len = self._get_dataset_length()
self.db = self._load_data()
def __getitem__(self, index):
return self.db[str(index)]
def __len__(self):
return self.len
def _get_dataset_length(self):
file_record_info = self.file + ".json"
record_info = json.load(open(file_record_info, "r", encoding="utf-8"))
return record_info["samples_num"]
def _load_data(self):
return shelve.open(self.file)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for item in batch:
batch_token_ids.append(item['input_ids'])
batch_labels.append(item['masked_lm_labels'])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids], batch_labels
# 从语料文件夹中随机选取一个文件,生成dataloader
def get_train_dataloader():
while True:
# prepare dataset
files_training_data = os.listdir(dir_training_data)
files_training_data = [file.split(".")[0] for file in files_training_data if "train" in file]
# 防止使用到正在生成的文件
files_training_data = [i for i in set(files_training_data) if files_training_data.count(i)==4]
if files_training_data:
file_train = random.choice(files_training_data)
for suffix in [".bak", ".dat", ".dir", ".json"]:
file_old = os.path.join(dir_training_data, file_train + suffix)
file_new = os.path.join(dir_training_data, task_name + suffix)
os.renames(file_old, file_new)
cur_load_file = file_new.split(".")[0]
train_dataloader = DataLoader(MyDataset(cur_load_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
break
else:
print("No training data! Sleep 300s!")
time.sleep(10)
continue
return train_dataloader
train_dataloader = get_train_dataloader()
model = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_mlm=True).to(device)
# weight decay
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay_rate},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
class MyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, output, batch_labels):
y_preds = output[-1]
y_preds = y_preds.reshape(-1, y_preds.shape[-1])
return super().forward(y_preds, batch_labels.flatten())
# 定义使用的loss和optimizer,这里支持自定义
optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate, weight_decay=weight_decay_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
model.compile(loss=MyLoss(ignore_index=0), optimizer=optimizer, scheduler=scheduler)
class ModelCheckpoint(Callback):
"""自动保存最新模型
"""
def on_dataloader_end(self, logs=None):
# 在dataloader结束的时候,关闭db并且删除训练的文件
model.train_dataloader.dataset.db.close()
for suffix in [".bak", ".dat", ".dir", ".json"]:
file_remove = os.path.join(dir_training_data, task_name + suffix)
try:
os.remove(file_remove)
except:
print(f"Failed to remove training data {file_remove}.")
# 重新生成dataloader
model.train_dataloader = get_train_dataloader()
def on_epoch_end(self, global_step, epoch, logs=None):
model.save_weights(model_saved_path)
if __name__ == '__main__':
# 保存模型
checkpoint = ModelCheckpoint()
# 模型训练
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
grad_accumulation_steps=grad_accum_steps,
epochs=epochs,
callbacks=[checkpoint],
)
#! -*- coding: utf-8 -*-
# 预训练语料构建,这里实现的mlm任务的,NSP和SOP未使用
# 方案:一直动态生成文件,超过最大保存数目时候sleep,
# 当训练速度超过文件生成速度时候,可开启多个数据生成脚本
import numpy as np
from bert4torch.tokenizers import Tokenizer
import json, glob, re
from tqdm import tqdm
import collections
import gc
import shelve
import time
import os
import random
import jieba
jieba.initialize()
class TrainingDataset(object):
"""预训练数据集生成器
"""
def __init__(self, tokenizer, sequence_length=512):
"""参数说明:
tokenizer必须是bert4keras自带的tokenizer类;
"""
self.tokenizer = tokenizer
self.sequence_length = sequence_length
self.token_pad_id = tokenizer._token_pad_id
self.token_cls_id = tokenizer._token_start_id
self.token_sep_id = tokenizer._token_end_id
self.token_mask_id = tokenizer._token_mask_id
self.vocab_size = tokenizer._vocab_size
def padding(self, sequence, padding_value=None):
"""对单个序列进行补0
"""
if padding_value is None:
padding_value = self.token_pad_id
sequence = sequence[:self.sequence_length]
padding_length = self.sequence_length - len(sequence)
return sequence + [padding_value] * padding_length
def sentence_process(self, text):
"""单个文本的处理函数,返回处理后的instance
"""
raise NotImplementedError
def paragraph_process(self, texts, starts, ends, paddings):
"""单个段落(多个文本)的处理函数
说明:texts是单句组成的list;starts是每个instance的起始id;
ends是每个instance的终止id;paddings是每个instance的填充id。
做法:不断塞句子,直到长度最接近sequence_length,然后padding。
"""
instances, instance = [], [[start] for start in starts]
for text in texts:
# 处理单个句子
sub_instance = self.sentence_process(text)
sub_instance = [i[:self.sequence_length - 2] for i in sub_instance]
new_length = len(instance[0]) + len(sub_instance[0])
# 如果长度即将溢出
if new_length > self.sequence_length - 1:
# 插入终止符,并padding
complete_instance = []
for item, end, pad in zip(instance, ends, paddings):
item.append(end)
item = self.padding(item, pad)
complete_instance.append(item)
# 存储结果,并构建新样本
instances.append(complete_instance)
instance = [[start] for start in starts]
# 样本续接
for item, sub_item in zip(instance, sub_instance):
item.extend(sub_item)
# 插入终止符,并padding
complete_instance = []
for item, end, pad in zip(instance, ends, paddings):
item.append(end)
item = self.padding(item, pad)
complete_instance.append(item)
# 存储最后的instance
instances.append(complete_instance)
return instances
def serialize(self, instances, db, count):
"""写入到文件
"""
for instance in instances:
input_ids, masked_lm_labels = instance[0], instance[1]
            assert len(input_ids) <= self.sequence_length
features = collections.OrderedDict()
features["input_ids"] = input_ids
features["masked_lm_labels"] = masked_lm_labels
db[str(count)] = features
count += 1
return count
def process(self, corpus, record_name):
"""处理输入语料(corpus)
"""
count = 0
db = shelve.open(record_name)
for texts in corpus:
instances = self.paragraph_process(texts)
count = self.serialize(instances, db, count)
db.close()
del instances
gc.collect()
# 记录对应的文件名和样本量
record_info = {"filename": record_name, "samples_num": count}
json.dump(record_info, open(record_name + ".json", "w", encoding="utf-8"))
print('write %s examples into %s' % (count, record_name))
class TrainingDatasetRoBERTa(TrainingDataset):
"""预训练数据集生成器(RoBERTa模式)
"""
def __init__(self, tokenizer, word_segment, mask_rate=0.15, sequence_length=512):
"""参数说明:
tokenizer必须是bert4torch自带的tokenizer类;
word_segment是任意分词函数。
"""
super(TrainingDatasetRoBERTa, self).__init__(tokenizer, sequence_length)
self.word_segment = word_segment
self.mask_rate = mask_rate
def token_process(self, token_id):
"""以80%的几率替换为[MASK],以10%的几率保持不变,
以10%的几率替换为一个随机token。
"""
rand = np.random.random()
if rand <= 0.8:
return self.token_mask_id
elif rand <= 0.9:
return token_id
else:
return np.random.randint(0, self.vocab_size)
def sentence_process(self, text):
"""单个文本的处理函数
流程:分词,然后转id,按照mask_rate构建全词mask的序列, 来指定哪些token是否要被mask
"""
words = self.word_segment(text)
rands = np.random.random(len(words))
token_ids, mask_ids = [], []
for rand, word in zip(rands, words):
word_tokens = self.tokenizer.tokenize(text=word)[1:-1]
word_token_ids = self.tokenizer.tokens_to_ids(word_tokens)
if rand < self.mask_rate:
word_mask_ids = [self.token_process(i) for i in word_token_ids]
token_ids.extend(word_mask_ids)
mask_ids.extend(word_token_ids)
else:
token_ids.extend(word_token_ids)
word_mask_ids = [0] * len(word_tokens)
mask_ids.extend(word_mask_ids)
return [token_ids, mask_ids]
def paragraph_process(self, texts):
"""给原方法补上starts、ends、paddings
"""
starts = [self.token_cls_id, 0]
ends = [self.token_sep_id, 0]
paddings = [self.token_pad_id, 0]
return super(TrainingDatasetRoBERTa, self).paragraph_process(texts, starts, ends, paddings)
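# token_process() above applies the usual BERT corruption scheme to a token chosen for
# masking: ~80% become [MASK], ~10% stay unchanged, ~10% become a random token. A
# standalone illustrative sketch of the same decision rule (the ids and vocab size below
# are hypothetical defaults, independent of the real tokenizer):
def _demo_mask_decision(token_id=100, mask_id=103, vocab_size=21128, rand=None):
    rand = np.random.random() if rand is None else rand
    if rand <= 0.8:
        return mask_id                            # replace with [MASK]
    elif rand <= 0.9:
        return token_id                           # keep unchanged
    return np.random.randint(0, vocab_size)       # replace with a random token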
if __name__ == '__main__':
sequence_length = 512 # 文本长度
max_file_num = 40 # 最大保存的文件个数
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt' # 字典文件
dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain' # 保存的文件目录
dir_corpus = 'F:/Projects/data/corpus/pretrain' # 读入的语料地址
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def some_texts():
'''挑选语料
'''
files_corpus = glob.glob(f'{dir_corpus}/*/*') # 根据目录结构自行调整
file_corpus = random.choice(files_corpus) # 随机挑选一篇文章
count, texts = 0, []
with open(file_corpus, encoding='utf-8') as f:
for l in tqdm(f, desc=f'Load data from {file_corpus}'):
l = l.strip()
texts.extend(re.findall(u'.*?[\n。]+', l))
count += 1
if count == 10: # 10篇文章合在一起再处理
yield texts
count, texts = 0, []
if texts:
yield texts
def word_segment(text):
return jieba.lcut(text)
TD = TrainingDatasetRoBERTa(tokenizer, word_segment, sequence_length=sequence_length)
while True:
train_files = [file for file in os.listdir(dir_training_data) if ('train_' in file) and ('dat' in file)]
# 当保存的训练文件未达到指定数量时
if len(train_files) < max_file_num:
record_name = f'{dir_training_data}/train_'+ time.strftime('%Y%m%d%H%M%S', time.localtime())
TD.process(corpus=some_texts(), record_name=record_name)
time.sleep(1) # 可不加,这里是防止生成文件名一样
else:
time.sleep(300)
#! -*- coding: utf-8 -*-
# SimBERT_v2预训练代码stage1,训练方式和simbert类似+[MASK预测]
# 官方项目:https://github.com/ZhuiyiTechnology/roformer-sim
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()
# 基本信息
maxlen = 64
batch_size = 12
# bert配置,加载roformer权重
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 这里语料没有官方的丰富,可用自定义语料
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取语料,每行一个json
示例:{"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
D.append(json.loads(l))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def masked_encode(text):
"""wwm随机mask
"""
words = jieba.lcut(text)
rands = np.random.random(len(words))
source, target = [tokenizer._token_start_id], [0]
for r, w in zip(rands, words):
ids = tokenizer.encode(w)[0][1:-1]
if r < 0.15 * 0.8:
source.extend([tokenizer._token_mask_id] * len(ids))
target.extend(ids)
elif r < 0.15 * 0.9:
source.extend(ids)
target.extend(ids)
elif r < 0.15:
source.extend(np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1)
target.extend(ids)
else:
source.extend(ids)
target.extend([0] * len(ids))
source = source[:maxlen - 1] + [tokenizer._token_end_id]
target = target[:maxlen - 1] + [0]
return source, target
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
for d in batch:
text, synonyms = d['text'], d['synonyms']
synonyms = [text] + synonyms
np.random.shuffle(synonyms)
for _ in range(2):
text, synonym = synonyms[:2]
if np.random.random() < 0.5:
text_ids = masked_encode(text)[0]
else:
text_ids = tokenizer.encode(text)[0]
synonym_ids = tokenizer.encode(synonym)[0][1:]
truncate_sequences(maxlen * 2, -2, text_ids, synonym_ids)
token_ids = text_ids + synonym_ids
segment_ids = [0] * len(text_ids) + [1] * len(synonym_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
text, synonym = synonym, text
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]
train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 建立加载模型
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
with_pool='linear', with_mlm=True, dropout_rate=0.2, application='unilm')
self.pool_method = pool_method
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return seq_logit, sen_emb
model = Model(pool_method='cls').to(device)
class TotalLoss(nn.Module):
"""loss分两部分,一是seq2seq的交叉熵,二是相似度的交叉熵。
"""
def forward(self, outputs, target):
seq_logit, sen_emb = outputs
seq_label, seq_mask = target
seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
similarity_loss = self.compute_loss_of_similarity(sen_emb)
return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}
def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
'''
y_pred: [btz, seq_len, hdsz]
y_true: [btz, seq_len]
y_mask: [btz, seq_len]
'''
y_true = y_true[:, 1:] # 目标token_ids
y_mask = y_mask[:, 1:] # 指示了要预测的部分
y_pred = y_pred[:, :-1, :] # 预测序列,错开一位
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return F.cross_entropy(y_pred, y_true, ignore_index=0)
def compute_loss_of_similarity(self, y_pred):
y_true = self.get_labels_of_similarity(y_pred) # 构建标签
y_pred = F.normalize(y_pred, p=2, dim=-1) # 句向量归一化
similarities = torch.matmul(y_pred, y_pred.T) # 相似度矩阵
similarities = similarities - torch.eye(y_pred.shape[0], device=device) * 1e12 # 排除对角线
similarities = similarities * 30 # scale
loss = F.cross_entropy(similarities, y_true)
return loss
def get_labels_of_similarity(self, y_pred):
idxs = torch.arange(0, y_pred.shape[0], device=device)
idxs_1 = idxs[None, :]
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
labels = idxs_1.eq(idxs_2).float()
return labels
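# get_labels_of_similarity pairs samples (0,1), (2,3), ... as positives, which works because
# collate_fn appends a text and its synonym consecutively. A standalone illustrative sketch
# for a (hypothetical) batch size of 4:
def _demo_similarity_labels(btz=4):
    idxs = torch.arange(0, btz)
    partners = idxs + 1 - idxs % 2 * 2                    # [1, 0, 3, 2]: each sample's positive
    return (idxs[None, :] == partners[:, None]).float()   # 1 where column j is row i's partner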
model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps('logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
seq_logit, _ = model.predict([token_ids, segment_ids])
return seq_logit[:, -1, :]
def generate(self, text, n=1, topk=5):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.random_sample([token_ids, segment_ids], n, topk) # 基于随机采样
return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]
synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def cal_sen_emb(text_list):
'''输入text的list,计算sentence的embedding
'''
X, S = [], []
for t in text_list:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
_, Z = model.predict([X, S])
return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
r = [i for i in set(r) if i != text] # 不和原文相同
r = [text] + r
Z = cal_sen_emb(r)
    Z /= (Z**2).sum(dim=1, keepdim=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
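# gen_synonyms ranks candidates by cosine similarity to the original sentence: embeddings are
# L2-normalised so a dot product equals cosine similarity, and argsort over the negated scores
# gives a descending order. Standalone illustrative sketch with random vectors (shapes made up):
def _demo_rank_by_cosine(num_candidates=5, dim=8):
    Z = F.normalize(torch.randn(num_candidates + 1, dim), p=2, dim=1)  # row 0: original sentence
    return torch.matmul(Z[1:], -Z[0]).argsort()                        # most similar candidate first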
def just_show(some_samples):
"""随机观察一些样本的效果
"""
S = [np.random.choice(some_samples) for _ in range(3)]
for s in S:
try:
print(u'原句子:%s' % s)
print(u'同义句子:', gen_synonyms(s, 10, 10))
print()
except:
pass
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show(['微信和支付宝拿个好用?',
'微信和支付宝,哪个好?',
'微信和支付宝哪个好',
'支付宝和微信哪个好',
'支付宝和微信哪个好啊',
'微信和支付宝那个好用?',
'微信和支付宝哪个好用',
'支付宝和微信那个更好',
'支付宝和微信哪个好用',
'微信和支付宝用起来哪个好?',
'微信和支付宝选哪个好'
])
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# SimBERT_v2预训练代码stage2,把simbert的相似度蒸馏到roformer-sim上
# 官方项目:https://github.com/ZhuiyiTechnology/roformer-sim
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, get_pool_emb
from bert4torch.snippets import AutoRegressiveDecoder, Callback, truncate_sequences
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()
# 基本信息
maxlen = 64
batch_size = 12
# bert配置,需要加载stage1训练后的权重,这里直接加载官方最终的权重以示例
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 这里语料和stage1保持一致
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取语料,每行一个json
示例:{"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
D.append(json.loads(l))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def masked_encode(text):
"""wwm随机mask
"""
words = jieba.lcut(text)
rands = np.random.random(len(words))
source, target = [tokenizer._token_start_id], [0]
for r, w in zip(rands, words):
ids = tokenizer.encode(w)[0][1:-1]
if r < 0.15 * 0.8:
source.extend([tokenizer._token_mask_id] * len(ids))
target.extend(ids)
elif r < 0.15 * 0.9:
source.extend(ids)
target.extend(ids)
elif r < 0.15:
source.extend(
np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1
)
target.extend(ids)
else:
source.extend(ids)
target.extend([0] * len(ids))
source = source[:maxlen - 1] + [tokenizer._token_end_id]
target = target[:maxlen - 1] + [0]
return source, target
# ========== 蒸馏用:开始 ==========
# simbert配置
sim_config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
sim_checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
sim_dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'
# 建立分词器
sim_tokenizer = Tokenizer(sim_dict_path, do_lower_case=True)
# 建立加载模型
simbert = build_transformer_model(sim_config_path, sim_checkpoint_path, with_pool='linear', application='unilm').to(device)
# ========== 蒸馏用:结束 ==========
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
batch_sim_token_ids, batch_sim_segment_ids = [], []
for d in batch:
text, synonyms = d['text'], d['synonyms']
synonyms = [text] + synonyms
np.random.shuffle(synonyms)
for _ in range(2):
text, synonym = synonyms[:2]
if np.random.random() < 0.5:
text_ids = masked_encode(text)[0]
else:
text_ids = tokenizer.encode(text)[0]
synonym_ids = tokenizer.encode(synonym)[0][1:]
truncate_sequences(maxlen * 2, -2, text_ids, synonym_ids)
token_ids = text_ids + synonym_ids
segment_ids = [0] * len(text_ids) + [1] * len(synonym_ids)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
# ==== 蒸馏用:开始 ====
token_ids, segment_ids = sim_tokenizer.encode(text, maxlen=maxlen)
batch_sim_token_ids.append(token_ids)
batch_sim_segment_ids.append(segment_ids)
# ==== 蒸馏用:结束 ====
text, synonym = synonym, text
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
# ==== 蒸馏用:开始 ====
batch_sim_token_ids = torch.tensor(sequence_padding(batch_sim_token_ids), dtype=torch.long, device=device)
batch_sim_segment_ids = torch.tensor(sequence_padding(batch_sim_segment_ids), dtype=torch.long, device=device)
sim_vecs = simbert.predict([batch_sim_token_ids, batch_sim_segment_ids])[1]
    sim_vecs /= (sim_vecs**2).sum(dim=-1, keepdim=True)**0.5
sims = torch.matmul(sim_vecs, sim_vecs.T)
# ==== 蒸馏用:结束 ====
return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids, sims]
train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 建立加载模型
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
with_pool='linear', with_mlm=True, dropout_rate=0.2, application='unilm')
self.pool_method = pool_method
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
return seq_logit, sen_emb
model = Model(pool_method='cls').to(device)
class TotalLoss(nn.Module):
"""loss分两部分,一是seq2seq的交叉熵,二是相似度的交叉熵。
"""
def forward(self, outputs, target):
seq_logit, sen_emb = outputs
seq_label, seq_mask, sims = target
seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
similarity_loss = self.compute_loss_of_similarity(sen_emb, sims)
return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}
def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
        '''
        y_pred: [btz, seq_len, vocab_size] MLM logits
        y_true: [btz, seq_len] target token ids
        y_mask: [btz, seq_len] segment ids marking the positions to predict
        '''
        y_true = y_true[:, 1:]  # target token ids
        y_mask = y_mask[:, 1:]  # marks the positions to be predicted (second segment)
        y_pred = y_pred[:, :-1, :]  # predictions, shifted left by one position
y_pred = y_pred.reshape(-1, y_pred.shape[-1])
y_true = (y_true*y_mask).flatten()
return F.cross_entropy(y_pred, y_true, ignore_index=0)
def compute_loss_of_similarity(self, y_pred, y_true):
        y_pred = F.normalize(y_pred, p=2, dim=-1)  # L2-normalize the sentence embeddings
        similarities = torch.matmul(y_pred, y_pred.T)  # pairwise cosine-similarity matrix
loss = 100 * torch.mean((similarities - y_true) ** 2)
return loss
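
# Illustrative toy example (hypothetical, CPU-only) of the shift-by-one seq2seq loss:
# logits at step t are scored against the token at step t+1, and positions whose mask
# is 0 are mapped to label 0, which cross_entropy skips via ignore_index=0.
def _toy_seq2seq_loss():
    vocab, seq_len = 6, 4
    logits = torch.randn(1, seq_len, vocab)   # [btz, seq_len, vocab_size]
    labels = torch.tensor([[2, 3, 4, 5]])     # [btz, seq_len]
    mask = torch.tensor([[0, 0, 1, 1]])       # only the last two tokens are predicted
    return TotalLoss().compute_loss_of_seq2seq(logits, labels, mask)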
model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps('logits')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
seq_logit, _ = model.predict([token_ids, segment_ids])
return seq_logit[:, -1, :]
def generate(self, text, n=1, topk=5):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random-sampling decode
return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]
synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
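
# Illustrative usage sketch (hypothetical helper): decode n candidates with top-k
# random sampling; wrapped in a function so nothing runs at import time.
def _demo_generate(text=u'微信和支付宝哪个好?', n=3, topk=5):
    return synonyms_generator.generate(text, n=n, topk=topk)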
def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
X, S = [], []
for t in text_list:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
_, Z = model.predict([X, S])
return Z
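
# Illustrative usage sketch (hypothetical helper): cosine similarity of two sentences
# computed from the pooled embeddings returned by cal_sen_emb.
def _demo_pair_similarity(text_a, text_b):
    Z = cal_sen_emb([text_a, text_b])
    Z = F.normalize(Z, p=2, dim=-1)
    return (Z[0] * Z[1]).sum().item()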
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
r = [text] + r
Z = cal_sen_emb(r)
    Z /= (Z**2).sum(dim=1, keepdims=True)**0.5  # L2-normalize
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()  # rank candidates by descending similarity to the input
    return [r[i + 1] for i in argsort[:k]]
def just_show(some_samples):
"""随机观察一些样本的效果
"""
S = [np.random.choice(some_samples) for _ in range(3)]
for s in S:
        try:
            print(u'Original: %s' % s)
            print(u'Synonyms:', gen_synonyms(s, 10, 10))
            print()
        except Exception:  # skip occasional decode failures in the demo
            pass
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
        # save the best checkpoint
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
        # show demo generations
just_show(['微信和支付宝拿个好用?',
'微信和支付宝,哪个好?',
'微信和支付宝哪个好',
'支付宝和微信哪个好',
'支付宝和微信哪个好啊',
'微信和支付宝那个好用?',
'微信和支付宝哪个好用',
'支付宝和微信那个更好',
'支付宝和微信哪个好用',
'微信和支付宝用起来哪个好?',
'微信和支付宝选哪个好'
])
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# SimBERT_v2 training code: the supervised stage
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import json
import glob
# Basic settings
maxlen = 64
batch_size = 12
labels = ['contradiction', 'entailment', 'neutral']
# BERT config: this stage should load the weights produced by stage 2; here the official released weights are loaded as an example
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def split(text):
"""分割句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen * 1.2, seps, strips)
class MyDataset(ListDataset):
def load_data(self, file_path):
dataset1_path, dataset2_path = file_path
D1 = self.load_data_1(dataset1_path)
D2 = self.load_data_2(dataset2_path)
return D1 + D2
@staticmethod
    def load_data_1(filenames):
        """Load labelled similarity data.
        filenames is a list of (path, threshold) pairs; each line yields
        (text1, text2, label), where label is 1 if the raw score exceeds that
        file's threshold and 0 otherwise.
        """
        D = []
        for filename, threshold in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
l = l.strip().split('\t')
if len(l) != 3:
continue
l[0], l[1] = split(l[0])[0], split(l[1])[0]
D.append((l[0], l[1], int(float(l[2]) > threshold)))
return D
@staticmethod
def load_data_2(dir_path):
"""加载数据(带标签)
单条格式:(文本1, 文本2, 标签)
"""
D = []
for filename in glob.glob(dir_path):
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if l['gold_label'] not in labels:
continue
text1 = split(l['sentence1'])[0]
text2 = split(l['sentence2'])[0]
label = labels.index(l['gold_label']) + 2
D.append((text1, text2, label))
return D
def truncate(text):
"""截断句子
"""
seps, strips = u'\n。!?!?;;,, ', u';;,, '
return text_segmentate(text, maxlen - 2, seps, strips)[0]
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text1, text2, label in batch:
for text in [text1, text2]:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels
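
# Illustrative shape check (hypothetical helper): text1/text2 are appended alternately,
# so the encoder sees 2 * len(batch) texts and Model.forward below recovers the pairs
# with sen_emb[::2] and sen_emb[1::2].
def _check_pair_interleaving(batch):
    inputs, batch_labels = collate_fn(batch)
    token_ids = inputs[0]
    assert token_ids.shape[0] == 2 * batch_labels.shape[0]
    return token_ids.shape, batch_labels.shape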
# Load the datasets
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
dataset1_path = []
for task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']:
    for f in ['train', 'valid']:
        # STS-B scores range over 0-5, so it uses a higher binarization threshold
        threshold = 2.5 if task_name == 'STS-B' else 0.5
        filename = '%s%s/%s.%s.data' % (data_path, task_name, task_name, f)
        dataset1_path.append((filename, threshold))
dataset2_path = 'F:/Projects/data/corpus/sentence_embedding/XNLI-MT-1.0/cnsd/cnsd-*/*.jsonl'
train_dataloader = DataLoader(MyDataset([dataset1_path, dataset2_path]), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Build and load the model
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
with_pool='linear', dropout_rate=0.2)
self.pool_method = pool_method
self.dense = nn.Linear(768*3, 5, bias=False)
def forward(self, token_ids, segment_ids):
hidden_state, pool_cls = self.bert([token_ids, segment_ids])
sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method) # [btz*2, hdsz]
        # Combine the pair: concatenate u, v and |u - v| (Sentence-BERT style)
u, v = sen_emb[::2], sen_emb[1::2]
sen_emb_concat = torch.cat([u, v, torch.abs(u-v)], dim=-1) # [btz, hdsz*3]
y_pred = self.dense(sen_emb_concat) # [btz, 5]
return y_pred
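
# Illustrative toy check (hypothetical, CPU-only): the u / v / |u - v| concatenation
# follows the Sentence-BERT recipe; with hidden size 768 the classifier input has
# 768 * 3 = 2304 features per pair.
def _toy_pair_features():
    sen_emb = torch.randn(4, 768)            # 2 pairs -> 4 interleaved embeddings
    u, v = sen_emb[::2], sen_emb[1::2]
    feats = torch.cat([u, v, torch.abs(u - v)], dim=-1)
    assert feats.shape == (2, 768 * 3)
    return feats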
model = Model(pool_method='cls').to(device)
class MyLoss(nn.Module):
"""loss分
"""
def __init__(self) -> None:
super().__init__()
self.mask = torch.tensor([0,0,1,1,1], device=device)
def forward(self, y_pred, y_true):
        '''For binary-labelled samples (labels 0/1) mask the last three logits to -inf;
        for NLI samples (labels 2/3/4) mask the first two.
        '''
task = (y_true < 1.5).long()
y_pred_1 = y_pred - self.mask * 1e12
y_pred_2 = y_pred - (1-self.mask) * 1e12
y_pred = task * y_pred_1 + (1-task) * y_pred_2
return F.cross_entropy(y_pred, y_true.flatten())
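
# Illustrative worked example (hypothetical, assumes `device` is set above): labels 0/1
# come from the binary similarity data and labels 2/3/4 from the NLI data, so MyLoss
# masks the three NLI logits for binary samples and the two binary logits for NLI ones.
def _toy_myloss():
    y_pred = torch.zeros(2, 5, device=device)
    y_true = torch.tensor([[1], [3]], device=device)  # one binary sample, one NLI sample
    return MyLoss()(y_pred, y_true)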
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5))
class Evaluator(Callback):
"""评估模型
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, global_step, epoch, logs=None):
        # save the best checkpoint
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')