Commit 66a1d0d0 authored by yangzhong

Commit the initial version of the bert4torch project

#! -*- coding: utf-8 -*-
# Basic test: MLM prediction
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch
# Load the model; replace the paths with your own
root_model_path = "F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/bert_config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'
# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, with_mlm='softmax')  # build the model and load the weights
token_ids, segments_ids = tokenizer.encode("科学技术是第一生产力")
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))
tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])
# Note: the with_mlm argument must be passed when building the model above
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 3:5], dim=-1).numpy()
    print(tokenizer.decode(result))
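# Hedged sketch (added for illustration; the variable names below are not from the original script):
# the same forward call also works on a padded batch, using the sequence_padding helper from
# bert4torch.snippets.
from bert4torch.snippets import sequence_padding
texts = ["科学技术是第一生产力", "今天天气很好"]
batch_token_ids, batch_segment_ids = [], []
for text in texts:
    ids, segs = tokenizer.encode(text)
    ids[1] = tokenizer._token_mask_id  # mask the first real token, just as an example
    batch_token_ids.append(ids)
    batch_segment_ids.append(segs)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids))
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids))
with torch.no_grad():
    _, batch_probas = model([batch_token_ids, batch_segment_ids])
print(batch_probas.shape)  # (batch_size, seq_len, vocab_size)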
#! -*- coding: utf-8 -*-
# Basic test: MLM test for the roformer and roformer_v2 models
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch
choice = 'roformer_v2'  # roformer or roformer_v2
if choice == 'roformer':
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/"
    args_model = 'roformer'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/"
    args_model = 'roformer_v2'
# Load the model; replace the paths with your own
root_model_path = args_model_path
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'
# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model=args_model, with_mlm='softmax')  # build the model and load the weights
token_ids, segments_ids = tokenizer.encode("今天M很好,我M去公园玩。")
token_ids[3] = token_ids[8] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))
tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])
# Note: the with_mlm argument must be passed when building the model above
model.eval()
with torch.no_grad():
    _, logits = model([tokens_ids_tensor, segment_ids_tensor])

pred_str = 'Predict: '
for i, logit in enumerate(logits[0]):
    if token_ids[i] == tokenizer._token_mask_id:
        pred_str += tokenizer.id_to_token(torch.argmax(logit, dim=-1).item())
    else:
        pred_str += tokenizer.id_to_token(token_ids[i])
print(pred_str)
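# Hedged addition (illustrative only): show the top-5 candidates for every masked position,
# reusing the logits computed above; this uses plain torch and the tokenizer only.
for pos, tid in enumerate(token_ids):
    if tid == tokenizer._token_mask_id:
        topk_ids = torch.topk(logits[0, pos], k=5).indices.tolist()
        print(f'position {pos} top-5:', [tokenizer.id_to_token(j) for j in topk_ids])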
#! -*- coding: utf-8 -*-
# Use the built-in helper to expose SimBERT synonym generation as a Web service.
# It is a thin wrapper over bottlepy, intended only for temporary testing; performance is not guaranteed.
# For usage details see https://github.com/bojone/bert4keras/blob/8ffb46a16a79f87aa8cdf045df7994036b4be47d/bert4keras/snippets.py#L580
import torch
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import WebServing
# Basic settings
maxlen = 32
choice = 'simbert'  # simbert or simbert_v2
if choice == 'simbert':
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base"
    args_model = 'bert'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base"
    args_model = 'roformer'
# Load the simbert weights (or the roformer-sim weights for simbert_v2)
root_model_path = args_model_path
dict_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
# Build the model and load the weights
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool='linear', model=args_model,
                                            application='unilm', keep_tokens=keep_tokens)
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb

model = Model(pool_method='cls').to(device)
class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]

synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z
def gen_synonyms(text, n=100, k=20):
    """Generate n candidate paraphrases of the input sentence and return the k most similar ones.
    Approach: generate with seq2seq, then score similarity with the encoder and sort.
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
    r = [text] + r
    Z = cal_sen_emb(r)
    Z /= (Z**2).sum(dim=1, keepdims=True)**0.5
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]
if __name__ == '__main__':
    arguments = {'text': (None, True), 'n': (int, False), 'k': (int, False)}
    web = WebServing(port=8864)
    web.route('/gen_synonyms', gen_synonyms, arguments)
    web.start()
    # You can now test it at http://127.0.0.1:8864/gen_synonyms?text=苹果多少钱一斤
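# --- Hedged client sketch (added for illustration, not part of the original service code) ---
# Assuming the server above is running locally, the endpoint can be queried with the `requests` library;
# the exact response format depends on WebServing's serialization.
# import requests
# resp = requests.get('http://127.0.0.1:8864/gen_synonyms', params={'text': '苹果多少钱一斤', 'n': 50, 'k': 10})
# print(resp.text)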
# TensorFlow weights: https://github.com/ZhuiyiTechnology/GAU-alpha
# The variables are mapped directly onto the GAU_alpha structure, so no extra mapping table is needed.
import torch
import tensorflow as tf
tf_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-tf]--chinese_GAU-alpha-char_L-24_H-768/bert_model.ckpt'
torch_state_dict = {}
ts = tf.train.load_variable(tf_path, 'bert/embeddings/word_embeddings')
torch_state_dict['embeddings.word_embeddings.weight'] = torch.from_numpy(ts)
torch_state_dict['mlmDecoder.weight'] = torch.from_numpy(ts)
ts = tf.train.load_variable(tf_path, 'bert/embeddings/token_type_embeddings')
torch_state_dict['embeddings.segment_embeddings.weight'] = torch.from_numpy(ts)
for i in range(24):
    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/i_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.i_dense.weight'] = torch.from_numpy(ts.T)
    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/o_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.o_dense.weight'] = torch.from_numpy(ts.T)
    ts1 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/q_scaleoffset/gamma')
    ts2 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/k_scaleoffset/gamma')
    ts = torch.stack([torch.from_numpy(ts1), torch.from_numpy(ts2)], dim=0)
    torch_state_dict[f'encoderLayer.{i}.gau.offsetscale.gamma'] = ts
torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin')
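# Hedged sanity check (added for illustration, not part of the original conversion):
# print a few converted shapes before relying on the saved file.
for name in ['embeddings.word_embeddings.weight', 'encoderLayer.0.gau.i_dense.weight', 'encoderLayer.23.gau.offsetscale.gamma']:
    print(name, tuple(torch_state_dict[name].shape))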
# config file
'''
{
"hidden_act": "swish",
"hidden_size": 768,
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"num_attention_heads": 1,
"attention_key_size": 128,
"intermediate_size": 1536,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 12000
}
'''
#! -*- coding: utf-8 -*-
# Convert cloudwalk's pretrained BART model into weights usable by bert4torch
# Baidu Netdisk link for the weights:
import torch
ckpt_file = 'F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/pytorch_base_model_2024000.pt'
torch_weights = torch.load(ckpt_file)
map = {'bart.embeddings.word_embeddings.weight': 'encoder.embed_tokens.weight',
'bart.embeddings.position_embeddings.weight': 'encoder.embed_positions.weight',
'bart.embeddings.LayerNorm.weight': 'encoder.layernorm_embedding.weight',
'bart.embeddings.LayerNorm.bias': 'encoder.layernorm_embedding.bias',
'bart.encoder.encoder_layer.0.attention.self.query.weight': 'encoder.layers.0.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.0.attention.self.query.bias': 'encoder.layers.0.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.0.attention.self.key.weight': 'encoder.layers.0.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.0.attention.self.key.bias': 'encoder.layers.0.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.0.attention.self.value.weight': 'encoder.layers.0.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.0.attention.self.value.bias': 'encoder.layers.0.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.0.attention.output.dense.weight': 'encoder.layers.0.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.0.attention.output.dense.bias': 'encoder.layers.0.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.0.attention.output.LayerNorm.weight': 'encoder.layers.0.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.0.attention.output.LayerNorm.bias': 'encoder.layers.0.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.0.intermediate.dense.weight': 'encoder.layers.0.fc1.weight',
'bart.encoder.encoder_layer.0.intermediate.dense.bias': 'encoder.layers.0.fc1.bias',
'bart.encoder.encoder_layer.0.output.dense.weight': 'encoder.layers.0.fc2.weight',
'bart.encoder.encoder_layer.0.output.dense.bias': 'encoder.layers.0.fc2.bias',
'bart.encoder.encoder_layer.0.output.LayerNorm.weight': 'encoder.layers.0.final_layer_norm.weight',
'bart.encoder.encoder_layer.0.output.LayerNorm.bias': 'encoder.layers.0.final_layer_norm.bias',
'bart.encoder.encoder_layer.1.attention.self.query.weight': 'encoder.layers.1.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.1.attention.self.query.bias': 'encoder.layers.1.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.1.attention.self.key.weight': 'encoder.layers.1.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.1.attention.self.key.bias': 'encoder.layers.1.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.1.attention.self.value.weight': 'encoder.layers.1.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.1.attention.self.value.bias': 'encoder.layers.1.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.1.attention.output.dense.weight': 'encoder.layers.1.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.1.attention.output.dense.bias': 'encoder.layers.1.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.1.attention.output.LayerNorm.weight': 'encoder.layers.1.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.1.attention.output.LayerNorm.bias': 'encoder.layers.1.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.1.intermediate.dense.weight': 'encoder.layers.1.fc1.weight',
'bart.encoder.encoder_layer.1.intermediate.dense.bias': 'encoder.layers.1.fc1.bias',
'bart.encoder.encoder_layer.1.output.dense.weight': 'encoder.layers.1.fc2.weight',
'bart.encoder.encoder_layer.1.output.dense.bias': 'encoder.layers.1.fc2.bias',
'bart.encoder.encoder_layer.1.output.LayerNorm.weight': 'encoder.layers.1.final_layer_norm.weight',
'bart.encoder.encoder_layer.1.output.LayerNorm.bias': 'encoder.layers.1.final_layer_norm.bias',
'bart.encoder.encoder_layer.2.attention.self.query.weight': 'encoder.layers.2.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.2.attention.self.query.bias': 'encoder.layers.2.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.2.attention.self.key.weight': 'encoder.layers.2.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.2.attention.self.key.bias': 'encoder.layers.2.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.2.attention.self.value.weight': 'encoder.layers.2.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.2.attention.self.value.bias': 'encoder.layers.2.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.2.attention.output.dense.weight': 'encoder.layers.2.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.2.attention.output.dense.bias': 'encoder.layers.2.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.2.attention.output.LayerNorm.weight': 'encoder.layers.2.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.2.attention.output.LayerNorm.bias': 'encoder.layers.2.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.2.intermediate.dense.weight': 'encoder.layers.2.fc1.weight',
'bart.encoder.encoder_layer.2.intermediate.dense.bias': 'encoder.layers.2.fc1.bias',
'bart.encoder.encoder_layer.2.output.dense.weight': 'encoder.layers.2.fc2.weight',
'bart.encoder.encoder_layer.2.output.dense.bias': 'encoder.layers.2.fc2.bias',
'bart.encoder.encoder_layer.2.output.LayerNorm.weight': 'encoder.layers.2.final_layer_norm.weight',
'bart.encoder.encoder_layer.2.output.LayerNorm.bias': 'encoder.layers.2.final_layer_norm.bias',
'bart.encoder.encoder_layer.3.attention.self.query.weight': 'encoder.layers.3.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.3.attention.self.query.bias': 'encoder.layers.3.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.3.attention.self.key.weight': 'encoder.layers.3.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.3.attention.self.key.bias': 'encoder.layers.3.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.3.attention.self.value.weight': 'encoder.layers.3.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.3.attention.self.value.bias': 'encoder.layers.3.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.3.attention.output.dense.weight': 'encoder.layers.3.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.3.attention.output.dense.bias': 'encoder.layers.3.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.3.attention.output.LayerNorm.weight': 'encoder.layers.3.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.3.attention.output.LayerNorm.bias': 'encoder.layers.3.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.3.intermediate.dense.weight': 'encoder.layers.3.fc1.weight',
'bart.encoder.encoder_layer.3.intermediate.dense.bias': 'encoder.layers.3.fc1.bias',
'bart.encoder.encoder_layer.3.output.dense.weight': 'encoder.layers.3.fc2.weight',
'bart.encoder.encoder_layer.3.output.dense.bias': 'encoder.layers.3.fc2.bias',
'bart.encoder.encoder_layer.3.output.LayerNorm.weight': 'encoder.layers.3.final_layer_norm.weight',
'bart.encoder.encoder_layer.3.output.LayerNorm.bias': 'encoder.layers.3.final_layer_norm.bias',
'bart.encoder.encoder_layer.4.attention.self.query.weight': 'encoder.layers.4.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.4.attention.self.query.bias': 'encoder.layers.4.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.4.attention.self.key.weight': 'encoder.layers.4.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.4.attention.self.key.bias': 'encoder.layers.4.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.4.attention.self.value.weight': 'encoder.layers.4.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.4.attention.self.value.bias': 'encoder.layers.4.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.4.attention.output.dense.weight': 'encoder.layers.4.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.4.attention.output.dense.bias': 'encoder.layers.4.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.4.attention.output.LayerNorm.weight': 'encoder.layers.4.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.4.attention.output.LayerNorm.bias': 'encoder.layers.4.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.4.intermediate.dense.weight': 'encoder.layers.4.fc1.weight',
'bart.encoder.encoder_layer.4.intermediate.dense.bias': 'encoder.layers.4.fc1.bias',
'bart.encoder.encoder_layer.4.output.dense.weight': 'encoder.layers.4.fc2.weight',
'bart.encoder.encoder_layer.4.output.dense.bias': 'encoder.layers.4.fc2.bias',
'bart.encoder.encoder_layer.4.output.LayerNorm.weight': 'encoder.layers.4.final_layer_norm.weight',
'bart.encoder.encoder_layer.4.output.LayerNorm.bias': 'encoder.layers.4.final_layer_norm.bias',
'bart.encoder.encoder_layer.5.attention.self.query.weight': 'encoder.layers.5.self_attn.q_proj.weight',
'bart.encoder.encoder_layer.5.attention.self.query.bias': 'encoder.layers.5.self_attn.q_proj.bias',
'bart.encoder.encoder_layer.5.attention.self.key.weight': 'encoder.layers.5.self_attn.k_proj.weight',
'bart.encoder.encoder_layer.5.attention.self.key.bias': 'encoder.layers.5.self_attn.k_proj.bias',
'bart.encoder.encoder_layer.5.attention.self.value.weight': 'encoder.layers.5.self_attn.v_proj.weight',
'bart.encoder.encoder_layer.5.attention.self.value.bias': 'encoder.layers.5.self_attn.v_proj.bias',
'bart.encoder.encoder_layer.5.attention.output.dense.weight': 'encoder.layers.5.self_attn.out_proj.weight',
'bart.encoder.encoder_layer.5.attention.output.dense.bias': 'encoder.layers.5.self_attn.out_proj.bias',
'bart.encoder.encoder_layer.5.attention.output.LayerNorm.weight': 'encoder.layers.5.self_attn_layer_norm.weight',
'bart.encoder.encoder_layer.5.attention.output.LayerNorm.bias': 'encoder.layers.5.self_attn_layer_norm.bias',
'bart.encoder.encoder_layer.5.intermediate.dense.weight': 'encoder.layers.5.fc1.weight',
'bart.encoder.encoder_layer.5.intermediate.dense.bias': 'encoder.layers.5.fc1.bias',
'bart.encoder.encoder_layer.5.output.dense.weight': 'encoder.layers.5.fc2.weight',
'bart.encoder.encoder_layer.5.output.dense.bias': 'encoder.layers.5.fc2.bias',
'bart.encoder.encoder_layer.5.output.LayerNorm.weight': 'encoder.layers.5.final_layer_norm.weight',
'bart.encoder.encoder_layer.5.output.LayerNorm.bias': 'encoder.layers.5.final_layer_norm.bias',
'bart.decoder.decoder_layer.0.attention.self.query.weight': 'decoder.layers.0.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.0.attention.self.query.bias': 'decoder.layers.0.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.0.attention.self.key.weight': 'decoder.layers.0.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.0.attention.self.key.bias': 'decoder.layers.0.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.0.attention.self.value.weight': 'decoder.layers.0.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.0.attention.self.value.bias': 'decoder.layers.0.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.0.attention.output.dense.weight': 'decoder.layers.0.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.0.attention.output.dense.bias': 'decoder.layers.0.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.0.attention.output.LayerNorm.weight': 'decoder.layers.0.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.0.attention.output.LayerNorm.bias': 'decoder.layers.0.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.0.crossattention.self.query.weight': 'decoder.layers.0.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.self.query.bias': 'decoder.layers.0.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.self.key.weight': 'decoder.layers.0.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.self.key.bias': 'decoder.layers.0.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.self.value.weight': 'decoder.layers.0.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.self.value.bias': 'decoder.layers.0.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.output.dense.weight': 'decoder.layers.0.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.0.crossattention.output.dense.bias': 'decoder.layers.0.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.0.crossattention.output.LayerNorm.weight': 'decoder.layers.0.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.0.crossattention.output.LayerNorm.bias': 'decoder.layers.0.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.0.intermediate.dense.weight': 'decoder.layers.0.fc1.weight',
'bart.decoder.decoder_layer.0.intermediate.dense.bias': 'decoder.layers.0.fc1.bias',
'bart.decoder.decoder_layer.0.output.dense.weight': 'decoder.layers.0.fc2.weight',
'bart.decoder.decoder_layer.0.output.dense.bias': 'decoder.layers.0.fc2.bias',
'bart.decoder.decoder_layer.0.output.LayerNorm.weight': 'decoder.layers.0.final_layer_norm.weight',
'bart.decoder.decoder_layer.0.output.LayerNorm.bias': 'decoder.layers.0.final_layer_norm.bias',
'bart.decoder.decoder_layer.1.attention.self.query.weight': 'decoder.layers.1.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.1.attention.self.query.bias': 'decoder.layers.1.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.1.attention.self.key.weight': 'decoder.layers.1.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.1.attention.self.key.bias': 'decoder.layers.1.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.1.attention.self.value.weight': 'decoder.layers.1.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.1.attention.self.value.bias': 'decoder.layers.1.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.1.attention.output.dense.weight': 'decoder.layers.1.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.1.attention.output.dense.bias': 'decoder.layers.1.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.1.attention.output.LayerNorm.weight': 'decoder.layers.1.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.1.attention.output.LayerNorm.bias': 'decoder.layers.1.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.1.crossattention.self.query.weight': 'decoder.layers.1.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.self.query.bias': 'decoder.layers.1.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.self.key.weight': 'decoder.layers.1.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.self.key.bias': 'decoder.layers.1.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.self.value.weight': 'decoder.layers.1.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.self.value.bias': 'decoder.layers.1.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.output.dense.weight': 'decoder.layers.1.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.1.crossattention.output.dense.bias': 'decoder.layers.1.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.1.crossattention.output.LayerNorm.weight': 'decoder.layers.1.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.1.crossattention.output.LayerNorm.bias': 'decoder.layers.1.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.1.intermediate.dense.weight': 'decoder.layers.1.fc1.weight',
'bart.decoder.decoder_layer.1.intermediate.dense.bias': 'decoder.layers.1.fc1.bias',
'bart.decoder.decoder_layer.1.output.dense.weight': 'decoder.layers.1.fc2.weight',
'bart.decoder.decoder_layer.1.output.dense.bias': 'decoder.layers.1.fc2.bias',
'bart.decoder.decoder_layer.1.output.LayerNorm.weight': 'decoder.layers.1.final_layer_norm.weight',
'bart.decoder.decoder_layer.1.output.LayerNorm.bias': 'decoder.layers.1.final_layer_norm.bias',
'bart.decoder.decoder_layer.2.attention.self.query.weight': 'decoder.layers.2.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.2.attention.self.query.bias': 'decoder.layers.2.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.2.attention.self.key.weight': 'decoder.layers.2.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.2.attention.self.key.bias': 'decoder.layers.2.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.2.attention.self.value.weight': 'decoder.layers.2.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.2.attention.self.value.bias': 'decoder.layers.2.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.2.attention.output.dense.weight': 'decoder.layers.2.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.2.attention.output.dense.bias': 'decoder.layers.2.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.2.attention.output.LayerNorm.weight': 'decoder.layers.2.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.2.attention.output.LayerNorm.bias': 'decoder.layers.2.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.2.crossattention.self.query.weight': 'decoder.layers.2.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.self.query.bias': 'decoder.layers.2.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.self.key.weight': 'decoder.layers.2.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.self.key.bias': 'decoder.layers.2.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.self.value.weight': 'decoder.layers.2.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.self.value.bias': 'decoder.layers.2.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.output.dense.weight': 'decoder.layers.2.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.2.crossattention.output.dense.bias': 'decoder.layers.2.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.2.crossattention.output.LayerNorm.weight': 'decoder.layers.2.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.2.crossattention.output.LayerNorm.bias': 'decoder.layers.2.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.2.intermediate.dense.weight': 'decoder.layers.2.fc1.weight',
'bart.decoder.decoder_layer.2.intermediate.dense.bias': 'decoder.layers.2.fc1.bias',
'bart.decoder.decoder_layer.2.output.dense.weight': 'decoder.layers.2.fc2.weight',
'bart.decoder.decoder_layer.2.output.dense.bias': 'decoder.layers.2.fc2.bias',
'bart.decoder.decoder_layer.2.output.LayerNorm.weight': 'decoder.layers.2.final_layer_norm.weight',
'bart.decoder.decoder_layer.2.output.LayerNorm.bias': 'decoder.layers.2.final_layer_norm.bias',
'bart.decoder.decoder_layer.3.attention.self.query.weight': 'decoder.layers.3.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.3.attention.self.query.bias': 'decoder.layers.3.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.3.attention.self.key.weight': 'decoder.layers.3.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.3.attention.self.key.bias': 'decoder.layers.3.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.3.attention.self.value.weight': 'decoder.layers.3.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.3.attention.self.value.bias': 'decoder.layers.3.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.3.attention.output.dense.weight': 'decoder.layers.3.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.3.attention.output.dense.bias': 'decoder.layers.3.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.3.attention.output.LayerNorm.weight': 'decoder.layers.3.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.3.attention.output.LayerNorm.bias': 'decoder.layers.3.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.3.crossattention.self.query.weight': 'decoder.layers.3.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.self.query.bias': 'decoder.layers.3.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.self.key.weight': 'decoder.layers.3.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.self.key.bias': 'decoder.layers.3.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.self.value.weight': 'decoder.layers.3.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.self.value.bias': 'decoder.layers.3.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.output.dense.weight': 'decoder.layers.3.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.3.crossattention.output.dense.bias': 'decoder.layers.3.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.3.crossattention.output.LayerNorm.weight': 'decoder.layers.3.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.3.crossattention.output.LayerNorm.bias': 'decoder.layers.3.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.3.intermediate.dense.weight': 'decoder.layers.3.fc1.weight',
'bart.decoder.decoder_layer.3.intermediate.dense.bias': 'decoder.layers.3.fc1.bias',
'bart.decoder.decoder_layer.3.output.dense.weight': 'decoder.layers.3.fc2.weight',
'bart.decoder.decoder_layer.3.output.dense.bias': 'decoder.layers.3.fc2.bias',
'bart.decoder.decoder_layer.3.output.LayerNorm.weight': 'decoder.layers.3.final_layer_norm.weight',
'bart.decoder.decoder_layer.3.output.LayerNorm.bias': 'decoder.layers.3.final_layer_norm.bias',
'bart.decoder.decoder_layer.4.attention.self.query.weight': 'decoder.layers.4.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.4.attention.self.query.bias': 'decoder.layers.4.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.4.attention.self.key.weight': 'decoder.layers.4.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.4.attention.self.key.bias': 'decoder.layers.4.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.4.attention.self.value.weight': 'decoder.layers.4.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.4.attention.self.value.bias': 'decoder.layers.4.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.4.attention.output.dense.weight': 'decoder.layers.4.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.4.attention.output.dense.bias': 'decoder.layers.4.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.4.attention.output.LayerNorm.weight': 'decoder.layers.4.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.4.attention.output.LayerNorm.bias': 'decoder.layers.4.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.4.crossattention.self.query.weight': 'decoder.layers.4.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.self.query.bias': 'decoder.layers.4.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.self.key.weight': 'decoder.layers.4.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.self.key.bias': 'decoder.layers.4.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.self.value.weight': 'decoder.layers.4.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.self.value.bias': 'decoder.layers.4.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.output.dense.weight': 'decoder.layers.4.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.4.crossattention.output.dense.bias': 'decoder.layers.4.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.4.crossattention.output.LayerNorm.weight': 'decoder.layers.4.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.4.crossattention.output.LayerNorm.bias': 'decoder.layers.4.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.4.intermediate.dense.weight': 'decoder.layers.4.fc1.weight',
'bart.decoder.decoder_layer.4.intermediate.dense.bias': 'decoder.layers.4.fc1.bias',
'bart.decoder.decoder_layer.4.output.dense.weight': 'decoder.layers.4.fc2.weight',
'bart.decoder.decoder_layer.4.output.dense.bias': 'decoder.layers.4.fc2.bias',
'bart.decoder.decoder_layer.4.output.LayerNorm.weight': 'decoder.layers.4.final_layer_norm.weight',
'bart.decoder.decoder_layer.4.output.LayerNorm.bias': 'decoder.layers.4.final_layer_norm.bias',
'bart.decoder.decoder_layer.5.attention.self.query.weight': 'decoder.layers.5.self_attn.q_proj.weight',
'bart.decoder.decoder_layer.5.attention.self.query.bias': 'decoder.layers.5.self_attn.q_proj.bias',
'bart.decoder.decoder_layer.5.attention.self.key.weight': 'decoder.layers.5.self_attn.k_proj.weight',
'bart.decoder.decoder_layer.5.attention.self.key.bias': 'decoder.layers.5.self_attn.k_proj.bias',
'bart.decoder.decoder_layer.5.attention.self.value.weight': 'decoder.layers.5.self_attn.v_proj.weight',
'bart.decoder.decoder_layer.5.attention.self.value.bias': 'decoder.layers.5.self_attn.v_proj.bias',
'bart.decoder.decoder_layer.5.attention.output.dense.weight': 'decoder.layers.5.self_attn.out_proj.weight',
'bart.decoder.decoder_layer.5.attention.output.dense.bias': 'decoder.layers.5.self_attn.out_proj.bias',
'bart.decoder.decoder_layer.5.attention.output.LayerNorm.weight': 'decoder.layers.5.self_attn_layer_norm.weight',
'bart.decoder.decoder_layer.5.attention.output.LayerNorm.bias': 'decoder.layers.5.self_attn_layer_norm.bias',
'bart.decoder.decoder_layer.5.crossattention.self.query.weight': 'decoder.layers.5.encoder_attn.q_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.self.query.bias': 'decoder.layers.5.encoder_attn.q_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.self.key.weight': 'decoder.layers.5.encoder_attn.k_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.self.key.bias': 'decoder.layers.5.encoder_attn.k_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.self.value.weight': 'decoder.layers.5.encoder_attn.v_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.self.value.bias': 'decoder.layers.5.encoder_attn.v_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.output.dense.weight': 'decoder.layers.5.encoder_attn.out_proj.weight',
'bart.decoder.decoder_layer.5.crossattention.output.dense.bias': 'decoder.layers.5.encoder_attn.out_proj.bias',
'bart.decoder.decoder_layer.5.crossattention.output.LayerNorm.weight': 'decoder.layers.5.encoder_attn_layer_norm.weight',
'bart.decoder.decoder_layer.5.crossattention.output.LayerNorm.bias': 'decoder.layers.5.encoder_attn_layer_norm.bias',
'bart.decoder.decoder_layer.5.intermediate.dense.weight': 'decoder.layers.5.fc1.weight',
'bart.decoder.decoder_layer.5.intermediate.dense.bias': 'decoder.layers.5.fc1.bias',
'bart.decoder.decoder_layer.5.output.dense.weight': 'decoder.layers.5.fc2.weight',
'bart.decoder.decoder_layer.5.output.dense.bias': 'decoder.layers.5.fc2.bias',
'bart.decoder.decoder_layer.5.output.LayerNorm.weight': 'decoder.layers.5.final_layer_norm.weight',
'bart.decoder.decoder_layer.5.output.LayerNorm.bias': 'decoder.layers.5.final_layer_norm.bias'}
model_new = {}
for key, value in map.items():
    model_new[value] = torch_weights[key]
torch.save(model_new, 'F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/bert4torch_pytorch_model.bin')
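# Hedged sanity check (added for illustration, not part of the original conversion):
# list any source keys the mapping above does not cover, so nothing is silently dropped.
unmapped = [k for k in torch_weights if k not in map]
print('source keys not covered by the mapping:', unmapped)
print('converted tensors:', len(model_new))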
# Convert the huggingface bert-base-chinese weights
# Weights: https://huggingface.co/bert-base-chinese
# The keys are not fully aligned with this framework's keys: the checkpoint stores the LayerNorm weight and bias as LayerNorm.gamma and LayerNorm.beta, so they are renamed here
import torch
state_dict = torch.load('F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/pytorch_model.bin')
state_dict_new = {}
for k, v in state_dict.items():
    if 'LayerNorm.gamma' in k:
        k = k.replace('LayerNorm.gamma', 'LayerNorm.weight')
        state_dict_new[k] = v
    elif 'LayerNorm.beta' in k:
        k = k.replace('LayerNorm.beta', 'LayerNorm.bias')
        state_dict_new[k] = v
    else:
        state_dict_new[k] = v
torch.save(state_dict_new, 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/bert4torch_pytorch_model.bin')
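# Hedged sanity check (added for illustration): after the renaming above, no gamma/beta keys should remain.
leftover = [k for k in state_dict_new if 'LayerNorm.gamma' in k or 'LayerNorm.beta' in k]
print('remaining gamma/beta keys:', leftover)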
# config file
'''
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"type_vocab_size": 2,
"vocab_size": 21128
}
'''
#! -*- coding: utf-8 -*-
# Convert Tsinghua's open-source Chinese GPT2 model (CPM, 2.6B parameters)
# Project (tf version): https://github.com/TsinghuaAI/CPM-Generate
# PyTorch weights: https://huggingface.co/TsinghuaAI/CPM-Generate; this script converts them into weights usable by bert4torch
import torch
ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 32
def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt2'

    w = torch_weights['transformer.wte.weight']
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
    w = torch_weights['transformer.wpe.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense after attention
        w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['transformer.h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['transformer.h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    # final layernorm
    w = torch_weights['transformer.ln_f.weight']
    new_weights[f'{prefix}.LayerNormFinal.weight'] = w
    b = torch_weights['transformer.ln_f.bias']
    new_weights[f'{prefix}.LayerNormFinal.bias'] = b

    torch.save(new_weights, output_ckpt_file)
if __name__ == '__main__':
    convert()
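    # Hedged spot check (added for illustration, not part of the original script): GPT2 stores the
    # c_attn/c_proj weights in [in, out] layout, so after the transposes above each converted weight
    # should be in nn.Linear's [out, in] layout.
    converted = torch.load(output_ckpt_file)
    w = converted['gpt2.encoder.layer.0.attention.self.query.weight']
    print('layer 0 query weight shape:', tuple(w.shape))  # expect (2560, 2560) for this 2.6B config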
# config file
'''
{
"vocab_size": 30000,
"hidden_size": 2560,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 10240,
"max_position_embeddings": 1024,
"num_attention_heads": 32,
"num_hidden_layers": 32
}
'''
#! -*- coding: utf-8 -*-
# gpt2-ml
# Project (tf version): https://github.com/imcaspar/gpt2-ml
# PyTorch weight conversion and download: https://github.com/ghosthamlet/gpt2-ml-torch
# Finally, this script converts them into weights usable by bert4torch
import torch
ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 48
def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt2_ml'

    w = torch_weights['wte.weight']
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
    w = torch_weights['wpe.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    # embedding layernorm
    w = torch_weights['emb_norm.weight']
    new_weights[f'{prefix}.embeddings.LayerNorm.weight'] = w
    b = torch_weights['emb_norm.bias']
    new_weights[f'{prefix}.embeddings.LayerNorm.bias'] = b

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense after attention
        w = torch_weights['h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    torch.save(new_weights, output_ckpt_file)
if __name__ == '__main__':
    convert()
# config file
'''
{
"vocab_size": 21130,
"hidden_size": 1536,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 6144,
"max_position_embeddings": 1024,
"num_attention_heads": 24,
"num_hidden_layers": 48
}
'''
#! -*- coding: utf-8 -*-
# Convert the CDial-GPT pytorch weights into weights usable by bert4torch (both base and large can be converted)
# Project (torch version): https://github.com/thu-coai/CDial-GPT
import torch
ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_pytorch_model.bin'
num_hidden_layers = 12
def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt'

    # In CDial-GPT, [CLS] is id 0 and [PAD] is id 1, which is unusual, so swap the two rows
    w = torch_weights['transformer.tokens_embed.weight']
    w = torch.cat([w[1:2], w[:1], w[2:]], axis=0)
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
    w = torch_weights['transformer.positions_embed.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense after attention
        w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['transformer.h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['transformer.h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    torch.save(new_weights, output_ckpt_file)
if __name__ == '__main__':
    convert()
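    # Hedged spot check (added for illustration): rows 0 and 1 of the token embedding should have been
    # swapped relative to the original checkpoint, per the [CLS]/[PAD] note above.
    old_emb = torch.load(ckpt_file)['transformer.tokens_embed.weight']
    new_emb = torch.load(output_ckpt_file)['gpt.embeddings.word_embeddings.weight']
    assert torch.equal(new_emb[0], old_emb[1]) and torch.equal(new_emb[1], old_emb[0])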
# config file
'''
{
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 513,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"vocab_size": 13088,
"type_vocab_size": 3,
"shared_segment_embeddings": true
}
'''
# NEZHA model for chit-chat, using weights already finetuned by Su Jianlin (苏神); note that this is not a pretrained model
# Source project: https://github.com/bojone/nezha_gpt_dialog
import torch
import tensorflow as tf
tf_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/model.ckpt'
torch_state_dict = {}
prefix = 'bert'
mapping = {
'bert/embeddings/word_embeddings': f'{prefix}.embeddings.word_embeddings.weight',
'bert/embeddings/token_type_embeddings': f'{prefix}.embeddings.token_type_embeddings.weight',
'bert/embeddings/LayerNorm/beta': f'{prefix}.embeddings.LayerNorm.bias',
'bert/embeddings/LayerNorm/gamma': f'{prefix}.embeddings.LayerNorm.weight',
'cls/predictions/transform/dense/kernel': 'cls.predictions.transform.dense.weight##',
'cls/predictions/transform/dense/bias': 'cls.predictions.transform.dense.bias',
'cls/predictions/transform/LayerNorm/beta': 'cls.predictions.transform.LayerNorm.bias',
'cls/predictions/transform/LayerNorm/gamma': 'cls.predictions.transform.LayerNorm.weight',
'cls/predictions/output_bias': 'cls.predictions.bias'
}
for i in range(12):
    prefix_i = f'{prefix}.encoder.layer.%d.' % i
    mapping.update({
        f'bert/encoder/layer_{i}/attention/self/query/kernel': prefix_i + 'attention.self.query.weight##',  # '##' marks tensors that must be transposed
        f'bert/encoder/layer_{i}/attention/self/query/bias': prefix_i + 'attention.self.query.bias',
        f'bert/encoder/layer_{i}/attention/self/key/kernel': prefix_i + 'attention.self.key.weight##',
        f'bert/encoder/layer_{i}/attention/self/key/bias': prefix_i + 'attention.self.key.bias',
        f'bert/encoder/layer_{i}/attention/self/value/kernel': prefix_i + 'attention.self.value.weight##',
        f'bert/encoder/layer_{i}/attention/self/value/bias': prefix_i + 'attention.self.value.bias',
        f'bert/encoder/layer_{i}/attention/output/dense/kernel': prefix_i + 'attention.output.dense.weight##',
        f'bert/encoder/layer_{i}/attention/output/dense/bias': prefix_i + 'attention.output.dense.bias',
        f'bert/encoder/layer_{i}/attention/output/LayerNorm/beta': prefix_i + 'attention.output.LayerNorm.bias',
        f'bert/encoder/layer_{i}/attention/output/LayerNorm/gamma': prefix_i + 'attention.output.LayerNorm.weight',
        f'bert/encoder/layer_{i}/intermediate/dense/kernel': prefix_i + 'intermediate.dense.weight##',
        f'bert/encoder/layer_{i}/intermediate/dense/bias': prefix_i + 'intermediate.dense.bias',
        f'bert/encoder/layer_{i}/output/dense/kernel': prefix_i + 'output.dense.weight##',
        f'bert/encoder/layer_{i}/output/dense/bias': prefix_i + 'output.dense.bias',
        f'bert/encoder/layer_{i}/output/LayerNorm/beta': prefix_i + 'output.LayerNorm.bias',
        f'bert/encoder/layer_{i}/output/LayerNorm/gamma': prefix_i + 'output.LayerNorm.weight'
    })
for key, value in mapping.items():
    ts = tf.train.load_variable(tf_path, key)
    if value.endswith('##'):
        value = value.replace('##', '')
        torch_state_dict[value] = torch.from_numpy(ts).T
    else:
        torch_state_dict[value] = torch.from_numpy(ts)
torch_state_dict['cls.predictions.decoder.weight'] = torch_state_dict[f'{prefix}.embeddings.word_embeddings.weight']
torch_state_dict['cls.predictions.decoder.bias'] = torch_state_dict['cls.predictions.bias']
torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin')
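# Hedged sanity check (added for illustration, not part of the original conversion): the decoder
# weight/bias were tied to the word embedding and MLM bias above, so they should share storage.
assert torch_state_dict['cls.predictions.decoder.weight'] is torch_state_dict['bert.embeddings.word_embeddings.weight']
print('total converted tensors:', len(torch_state_dict))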
# config file
'''
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"max_relative_position": 64,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 14195,
"use_relative_position": true
}
'''
# Supervised training of a Chinese-chess-playing model using a language model over game records
# Introduction: https://kexue.fm/archives/7877
# This script only converts the model already trained by Su Jianlin; note that it is not a pretrained model
import numpy as np
import h5py
import torch
# keras==2.3.1 is used here
from keras.engine import saving
tf_path = 'E:/Github/bert4keras/examples/best_model_chess.weights'
torch_state_dict = {}
# 1 means transpose, 0 means keep as-is
key_map = {
'Embedding-Token/embeddings:0': ['embeddings.word_embeddings.weight', 0],
'Embedding-Segment/embeddings:0': ['embeddings.segment_embeddings.weight', 0],
'Embedding-Position/embeddings:0': ['embeddings.position_embeddings.weight', 0],
'Embedding-Norm/gamma:0': ['embeddings.layerNorm.weight', 0],
'Embedding-Norm/beta:0': ['embeddings.layerNorm.bias', 0],
'MLM-Dense/kernel:0': ['mlmDense.weight', 1],
'MLM-Dense/bias:0': ['mlmDense.bias', 0],
'MLM-Norm/gamma:0': ['mlmLayerNorm.weight', 0],
'MLM-Norm/beta:0': ['mlmLayerNorm.bias', 0],
'MLM-Bias/bias:0': ['mlmBias', 0],
}
for i in range(12):
    key_map.update({
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.q.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.q.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.k.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.k.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.v.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.v.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.o.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.o.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm1.weight', 0],
        f'Transformer-{i}-MultiHeadSelfAttention-Norm/beta:0': [f'encoderLayer.{i}.layerNorm1.bias', 0],
        f'Transformer-{i}-FeedForward/dense_{i*6+5}/kernel:0': [f'encoderLayer.{i}.feedForward.intermediateDense.weight', 1],
        f'Transformer-{i}-FeedForward/dense_{i*6+5}/bias:0': [f'encoderLayer.{i}.feedForward.intermediateDense.bias', 0],
        f'Transformer-{i}-FeedForward/dense_{i*6+6}/kernel:0': [f'encoderLayer.{i}.feedForward.outputDense.weight', 1],
        f'Transformer-{i}-FeedForward/dense_{i*6+6}/bias:0': [f'encoderLayer.{i}.feedForward.outputDense.bias', 0],
        f'Transformer-{i}-FeedForward-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm2.weight', 0],
        f'Transformer-{i}-FeedForward-Norm/beta:0': [f'encoderLayer.{i}.layerNorm2.bias', 0],
    })
consume_keys = set()
with h5py.File(tf_path, mode='r') as f:
    if 'layer_names' not in f.attrs and 'model_weights' in f:
        f = f['model_weights']
    layer_names = saving.load_attributes_from_hdf5_group(f, 'layer_names')
    weight_value_tuples = []
    for k, name in enumerate(layer_names):
        g = f[name]
        weight_names = saving.load_attributes_from_hdf5_group(g, 'weight_names')
        weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for i, weight_name in enumerate(weight_names):
            new_key = key_map[weight_name][0]
            if key_map[weight_name][1] == 1:  # transpose
                torch_state_dict[new_key] = torch.from_numpy(weight_values[i]).T
            else:
                torch_state_dict[new_key] = torch.from_numpy(weight_values[i])
            assert new_key not in consume_keys, 'duplicate keys'
            consume_keys.add(new_key)
    if hasattr(f, 'close'):
        f.close()
    elif hasattr(f.file, 'close'):
        f.file.close()
torch_state_dict['mlmDecoder.weight'] = torch_state_dict['embeddings.word_embeddings.weight']
torch_state_dict['mlmDecoder.bias'] = torch_state_dict['mlmBias']
# for k, v in torch_state_dict.items():
# print(k, v.shape)
torch.save(torch_state_dict, 'E:/Github/bert4torch/examples/others/best_model_chess.pt')
# Convert t5_pegasus from tf into a pytorch version adapted for bert4torch
# Weights: https://github.com/ZhuiyiTechnology/t5-pegasus
import torch
import tensorflow as tf
import json
# small
# tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_small]--chinese_t5_pegasus_small/'
# torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small/pytorch_model.bin'
# base:
tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_base]--chinese_t5_pegasus_base/'
torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/pytorch_model.bin'
tf_path = tf_dir + 'model.ckpt'
with open(tf_dir + 'config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
num_layers = config['num_hidden_layers']
torch_state_dict = {}
mapping = {
'shared/embedding': 'shared.weight',
'encoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',  # custom marker: a '##T' suffix means the tensor must be transposed
'encoder/rms_norm/scale': 'encoder.final_layer_norm.weight',
'decoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',
'decoder/rms_norm/scale': 'decoder.final_layer_norm.weight',
'decoder/logits/kernel': 'lm_head.weight##T'
}
for i in range(num_layers):
    i1 = str(i).rjust(3, '0')
    mapping.update({
        f'encoder/block_{i1}/layer_000/SelfAttention/q': f'encoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/k': f'encoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/v': f'encoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/o': f'encoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'encoder/block_{i1}/layer_000/rms_norm/scale': f'encoder.block.{i}.layer.0.layer_norm.weight',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_0/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_1/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wo/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wo.weight##T',
        f'encoder/block_{i1}/layer_001/rms_norm/scale': f'encoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_000/SelfAttention/q': f'decoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/k': f'decoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/v': f'decoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/o': f'decoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'decoder/block_{i1}/layer_000/rms_norm/scale': f'decoder.block.{i}.layer.0.layer_norm.weight',
        f'decoder/block_{i1}/layer_001/EncDecAttention/q': f'decoder.block.{i}.layer.1.EncDecAttention.q.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/k': f'decoder.block.{i}.layer.1.EncDecAttention.k.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/v': f'decoder.block.{i}.layer.1.EncDecAttention.v.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/o': f'decoder.block.{i}.layer.1.EncDecAttention.o.weight##T',
        f'decoder/block_{i1}/layer_001/rms_norm/scale': f'decoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_0/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_1/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wo/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wo.weight##T',
        f'decoder/block_{i1}/layer_002/rms_norm/scale': f'decoder.block.{i}.layer.2.layer_norm.weight',
    })
transpose_layers = ['']
for k, v in mapping.items():
    ts = torch.from_numpy(tf.train.load_variable(tf_path, k))
    # if len(ts.shape) == 2 and ts.shape[0] == ts.shape[1]:
    #     print(k, v)
    if v.endswith('##T'):
        torch_state_dict[v[:-len('##T')]] = ts.T  # slice off the '##T' marker (str.rstrip would also strip trailing 'T' characters from the key itself)
    else:
        torch_state_dict[v] = ts
torch.save(torch_state_dict, torch_path)
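# Hedged sanity check (added for illustration): the transposed lm_head should now be [vocab_size, hidden_size],
# i.e. (50000, 768) for the base config shown below.
print('lm_head.weight:', tuple(torch_state_dict['lm_head.weight'].shape))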
# config file
'''
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2048,
"num_attention_heads": 12,
"attention_head_size": 64,
"num_hidden_layers": 12,
"vocab_size": 50000,
"hidden_act": "gelu",
"relative_attention_num_buckets": 32
}
'''
# Weights: https://huggingface.co/transfo-xl-wt103
# This model is English-only: it is used here just to debug the transformer_xl model structure in bert4torch, not for actual finetuning
import torch
ckpt_file = 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/pytorch_model.bin'
# key_map: direct renames from the HuggingFace checkpoint to bert4torch names (no transpose needed)
key_map = {
'transformer.word_emb.emb_layers.0.weight': 'embeddings.emb_layers.0.weight',
'transformer.word_emb.emb_layers.1.weight': 'embeddings.emb_layers.1.weight',
'transformer.word_emb.emb_layers.2.weight': 'embeddings.emb_layers.2.weight',
'transformer.word_emb.emb_layers.3.weight': 'embeddings.emb_layers.3.weight',
'transformer.word_emb.emb_projs.0': 'embeddings.emb_projs.0',
'transformer.word_emb.emb_projs.1': 'embeddings.emb_projs.1',
'transformer.word_emb.emb_projs.2': 'embeddings.emb_projs.2',
'transformer.word_emb.emb_projs.3': 'embeddings.emb_projs.3',
}
for i in range(18):
key_map.update({
f'transformer.layers.{i}.dec_attn.r_r_bias': f'encoderLayer.{i}.multiHeadAttention.r_r_bias',
f'transformer.layers.{i}.dec_attn.r_w_bias': f'encoderLayer.{i}.multiHeadAttention.r_w_bias',
f'transformer.layers.{i}.dec_attn.o_net.weight': f'encoderLayer.{i}.multiHeadAttention.o.weight',
f'transformer.layers.{i}.dec_attn.layer_norm.weight': f'encoderLayer.{i}.layerNorm1.weight',
f'transformer.layers.{i}.dec_attn.layer_norm.bias': f'encoderLayer.{i}.layerNorm1.bias',
f'transformer.layers.{i}.dec_attn.r_net.weight': f'encoderLayer.{i}.multiHeadAttention.r.weight',
f'transformer.layers.{i}.pos_ff.CoreNet.0.weight': f'encoderLayer.{i}.feedForward.intermediateDense.weight',
f'transformer.layers.{i}.pos_ff.CoreNet.0.bias': f'encoderLayer.{i}.feedForward.intermediateDense.bias',
f'transformer.layers.{i}.pos_ff.CoreNet.3.weight': f'encoderLayer.{i}.feedForward.outputDense.weight',
f'transformer.layers.{i}.pos_ff.CoreNet.3.bias': f'encoderLayer.{i}.feedForward.outputDense.bias',
f'transformer.layers.{i}.pos_ff.layer_norm.weight': f'encoderLayer.{i}.layerNorm2.weight',
f'transformer.layers.{i}.pos_ff.layer_norm.bias': f'encoderLayer.{i}.layerNorm2.bias',
})
torch_weights = torch.load(ckpt_file)
model_new = {}
for key, value in key_map.items():
model_new[value] = torch_weights[key]
for i in range(18):
qkv_net = torch_weights[f'transformer.layers.{i}.dec_attn.qkv_net.weight']
model_new[f'encoderLayer.{i}.multiHeadAttention.q.weight'], model_new[f'encoderLayer.{i}.multiHeadAttention.k.weight'], model_new[f'encoderLayer.{i}.multiHeadAttention.v.weight'] = qkv_net.chunk(3, dim=0)
torch.save(model_new, 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/bert4torch_pytorch_model.bin')
# config文件
'''
{
"adaptive": true,
"architectures": [
"TransfoXLLMHeadModel"
],
"attn_type": 0,
"clamp_len": 1000,
"cutoffs": [
20000,
40000,
200000
],
"d_embed": 1024,
"d_head": 64,
"intermediate_size": 4096,
"hidden_size": 1024,
"div_val": 4,
"is_dropout": true,
"adaptive_embedding": true,
"attention_probs_dropout_prob": 0.0,
"hidden_dropout_prob": 0.1,
"hidden_act": "relu",
"eos_token_id": 0,
"ext_len": 0,
"init": "normal",
"init_range": 0.01,
"init_std": 0.02,
"layer_norm_epsilon": 1e-05,
"mem_len": 1600,
"model_type": "transfo-xl",
"num_attention_heads": 16,
"num_hidden_layers": 18,
"pre_lnorm": false,
"proj_init_std": 0.01,
"same_length": true,
"sample_softmax": -1,
"task_specific_params": {
"text-generation": {
"do_sample": true,
"max_length": 250
}
},
"tgt_len": 128,
"tie_projs": [
false,
true,
true,
true
],
"tie_weight": true,
"untie_r": true,
"vocab_size": 267735
}
'''
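# Illustrative sketch of the qkv split used above, with toy shapes rather than the real
# checkpoint: transfo-xl stores q/k/v as one fused qkv_net weight (here n_head * d_head equals
# d_model, so the fused weight is [3 * d_model, d_model]), and chunk(3, dim=0) recovers three
# equally sized blocks in q, k, v order.
_d_model = 8  # toy dimension for demonstration only
_fused = torch.randn(3 * _d_model, _d_model)
_q_w, _k_w, _v_w = _fused.chunk(3, dim=0)
assert _q_w.shape == _k_w.shape == _v_w.shape == (_d_model, _d_model)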
#! -*- coding: utf-8 -*-
# bert做conditional language model任务
# 按类随机生成文本,这个demo的类别是情感极性(正/负)
# 请参考:https://kexue.fm/archives/7124
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate, Callback, AutoRegressiveDecoder, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
# 模型配置
maxlen = 128
batch_size = 16
num_classes = 2
epochs = 20
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
# if len(D) >= 100:
# break
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append(label)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels], batch_token_ids
# 加载数据集
train_dataloader = DataLoader(MyDataset([
'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data',
'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data',
'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
c = nn.Embedding(num_classes, 128)
self.bert = build_transformer_model(config_path,
checkpoint_path,
with_mlm=True,
application='lm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
layer_norm_cond=c,
ignore_invalid_weights=True) # 忽略未初始化的权重
def forward(self, inputs):
_, seq_output = self.bert(inputs) # [btz, seq_len, vocab_size]
return seq_output
model = Model().to(device)
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, input, target):
input = input[:, :-1, :].reshape(-1, input.shape[-1])
target = target[:, 1:].flatten()
return super().forward(input, target)
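# A small illustrative check of the shift above (a sketch, independent of the model): for
# causal LM training the prediction at position t is scored against the token at t + 1, so the
# logits drop the last step and the targets drop the first step before the loss is computed.
_toy_logits = torch.randn(2, 5, 7)           # [batch, seq_len, vocab]
_toy_target = torch.randint(0, 7, (2, 5))    # [batch, seq_len]
_shift_logits = _toy_logits[:, :-1, :].reshape(-1, 7)  # [batch * (seq_len - 1), vocab]
_shift_target = _toy_target[:, 1:].flatten()           # [batch * (seq_len - 1)]
assert _shift_logits.shape[0] == _shift_target.shape[0] == 2 * 4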
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class RandomSentiment(AutoRegressiveDecoder):
"""根据情感标签(0:负,1:正)随机生成一批句子
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids = output_ids
segment_ids = torch.zeros_like(token_ids, device=device)
label = inputs[0]
return model.predict([token_ids, segment_ids, label])[:, -1, :]
def generate(self, label, n=1, topp=0.95):
results = self.random_sample([[label]], n, topp=topp) # 基于随机采样
return [tokenizer.decode(ids.cpu().numpy()) for ids in results]
random_sentiment = RandomSentiment(
start_id=tokenizer._token_start_id,
end_id=tokenizer._token_end_id,
maxlen=maxlen,
device=device
)
def just_show():
print(u'正面采样:')
print(random_sentiment.generate(1, 5, 0.95), '\n')
print(u'负面采样:')
print(random_sentiment.generate(0, 5, 0.95), '\n')
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('./best_model.pt')
"""
正面采样:
[
u'外观时尚、漂亮、性价比高。',
u'外观漂亮,配置均衡,比较满意,性价比高,外观漂亮,性能较高。',
u'我是在大学的时候看到这本书的,所以一直在买。书中的作者是林静蕾,她用自己的口吻写出了一个孩子成长中的心路历程,让我看到了她们成长中的不同之处,以及她们成长过程中的不同境界。让我很欣赏!',
u'我想这是一本能够告诉读者什么是坏的,而不是教你怎样说话,告诉我什么是错。这里我推荐了《我要讲故事》,这本书是我很喜欢的一本书,我认为它的理由很多,但是,我相信我。如果你从中得到一些改进,或者你已经有了一个明智的决定。',
u'我们一家五口住的是标间,大床房,大床的床很舒服;而我们在携程网上订了两套大床房,这个酒店的价格还是比较合理的;但是房间的隔音效果不太理想,有点响的声音;酒店门口的地铁在施工中,不方便;但是酒店的门口的出租车不知道是哪个车的,打车不是很方便;酒店外面的停'
]
负面采样:
[
u'不知道是不是因为电池不太好,不是我不喜欢。',
u'看了评论才买的. 结果发现不是那么便宜, 价格也不便宜.',
u'1、外壳不容易沾手印,不容易洗洗2、屏幕有点旧, 不能下载铃声',
u'我是7月6日订购了《杜拉拉升职记》并已通过银行付款,为什么订单下了两周多至今还未到货?是收货时间太快了,可能就这么过去了吧?',
u'这本书我是在网上先看了一遍,后来我再看了一遍。感觉作者的文笔实在太烂了,特别是在写他的博客时特别别扭,写得很不专业,特别是他写股票时那个情绪调节的小男孩,简直就是自作聪明的样子,简直就是自作聪明的一种表现!'
]
"""
#! -*- coding:utf-8 -*-
# 自定义fit()训练过程
from itertools import cycle
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, ProgbarLogger
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
maxlen = 128
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
def fit(self, train_dataloader, steps_per_epoch, epochs=1):
'''自定义fit过程:适用于自带fit()不满足需求时,用于自定义训练过程
'''
# 实现进度条展示功能,不需要可以不用
bar = ProgbarLogger(epochs, steps_per_epoch, ['loss'])
global_step, epoch, best_val_acc = 0, 0, 0
train_dataloader = cycle(train_dataloader)
self.train()
for epoch in range(epochs):
bar.on_epoch_begin(epoch=epoch)
for bti in range(steps_per_epoch):
bar.on_batch_begin()
train_X, train_y = next(train_dataloader)
output = self.forward(*train_X)
loss = self.criterion(output, train_y)
loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()
bar.on_batch_end(logs={'loss': loss.item()}) # 和上面定义bar时候一致
global_step += 1
bar.on_epoch_end()
# 评估
val_acc = evaluate(valid_dataloader)
if val_acc > best_val_acc:
best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {best_val_acc:.5f}\n')
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
# 定义评价函数
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=100)
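# If the built-in behaviour is not enough, extra logic slots naturally into the custom loop; for
# example (a sketch, with max_norm chosen arbitrarily) gradient clipping goes right before
# optimizer.step() inside fit():
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
#     self.optimizer.step()
#     self.optimizer.zero_grad()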
#! -*- coding:utf-8 -*-
# 文本分类例子下的模型压缩
# 方法为BERT-of-Theseus
# 论文:https://arxiv.org/abs/2002.02925
# 博客:https://kexue.fm/archives/7575
import json
from bert4torch.models import build_transformer_model, BaseModel, BERT
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.tokenizers import Tokenizer
from bert4torch.layers import BertLayer
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torchinfo import summary
import copy
from torch.distributions.bernoulli import Bernoulli
num_classes = 119
maxlen = 128
batch_size = 32
replacing_rate = 0.5
steps_for_replacing = 2000
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""加载数据
单条格式: (文本, 标签id)
"""
D = []
with open(filename, encoding='utf-8') as f:
for i, l in enumerate(f):
l = json.loads(l)
text, label = l['sentence'], l['label']
D.append((text, int(label)))
return D
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/train.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/dev.json'), batch_size=batch_size, collate_fn=collate_fn)
class BERT_THESEUS(BERT):
def __init__(self, **kwargs):
super().__init__(**kwargs)
layer = BertLayer(self.hidden_size, self.num_attention_heads, self.dropout_rate, self.attention_probs_dropout_prob, self.intermediate_size, self.hidden_act, is_dropout=False, conditional_size=self.conditional_size)
        self.encoderLayer = nn.ModuleList([copy.deepcopy(layer) for _ in range(self.num_hidden_layers)])
self.scc_n_layer = 6 # 蒸馏到6层
self.scc_layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(self.scc_n_layer)])
self.compress_ratio = self.num_hidden_layers // self.scc_n_layer
self.bernoulli = None
def set_replacing_rate(self, replacing_rate):
if not 0 < replacing_rate <= 1:
raise Exception('Replace rate must be in the range (0, 1]!')
self.bernoulli = Bernoulli(torch.tensor([replacing_rate]))
def apply_main_layers(self, inputs):
"""BERT的主体是基于Self-Attention的模块
顺序:Att --> Add --> LN --> FFN --> Add --> LN
"""
hidden_states, attention_mask, conditional_emb = inputs
encoded_layers = [hidden_states] # 添加embedding的输出
if self.training:
inference_layers = []
for i in range(self.scc_n_layer):
if self.bernoulli.sample() == 1: # REPLACE
inference_layers.append(self.scc_layer[i])
else: # KEEP the original
for offset in range(self.compress_ratio):
inference_layers.append(self.encoderLayer[i * self.compress_ratio + offset])
else: # inference with compressed model
inference_layers = self.scc_layer
# forward
for i, layer_module in enumerate(inference_layers):
hidden_states = layer_module(hidden_states, attention_mask, conditional_emb)
if self.output_all_encoded_layers:
encoded_layers.append(hidden_states)
if not self.output_all_encoded_layers:
encoded_layers.append(hidden_states)
return [encoded_layers, conditional_emb]
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=BERT_THESEUS)
self.dense = nn.Linear(self.bert.configs['hidden_size'], num_classes)
def forward(self, token_ids, segment_ids):
encoded_layers = self.bert([token_ids, segment_ids])
output = self.dense(encoded_layers[:, 0, :]) # 取第1个位置
return output
model = Model().to(device)
summary(model, input_data=next(iter(train_dataloader))[0])
# replacing策略
class ConstantReplacementScheduler:
def __init__(self, bert_encoder, replacing_rate, replacing_steps=None):
self.bert_encoder = bert_encoder
self.replacing_rate = replacing_rate
self.replacing_steps = replacing_steps
self.step_counter = 0
self.bert_encoder.set_replacing_rate(replacing_rate)
def step(self):
self.step_counter += 1
if self.replacing_steps is None or self.replacing_rate == 1.0:
return self.replacing_rate
else:
if self.step_counter >= self.replacing_steps:
self.bert_encoder.set_replacing_rate(1.0)
self.replacing_rate = 1.0
return self.replacing_rate
class LinearReplacementScheduler:
def __init__(self, bert_encoder, base_replacing_rate, k):
self.bert_encoder = bert_encoder
self.base_replacing_rate = base_replacing_rate
self.step_counter = 0
self.k = k
self.bert_encoder.set_replacing_rate(base_replacing_rate)
def step(self):
self.step_counter += 1
current_replacing_rate = min(self.k * self.step_counter + self.base_replacing_rate, 1.0)
self.bert_encoder.set_replacing_rate(current_replacing_rate)
return current_replacing_rate
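# Quick illustration of the linear schedule above, using a stand-in encoder rather than the
# BERT model (a sketch only): the replacing probability grows as min(k * step + base, 1.0), so
# module replacement becomes steadily more aggressive until only the compressed scc layers run.
class _DummyEncoder:
    def set_replacing_rate(self, rate):
        self.rate = rate
_demo_scheduler = LinearReplacementScheduler(bert_encoder=_DummyEncoder(), base_replacing_rate=0.3, k=0.1)
# print([round(_demo_scheduler.step(), 2) for _ in range(10)])  # 0.4, 0.5, ..., then capped at 1.0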
replacing_rate_scheduler = ConstantReplacementScheduler(bert_encoder=model.bert, replacing_rate=replacing_rate, replacing_steps=steps_for_replacing)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), scheduler=replacing_rate_scheduler,
metrics=['accuracy'])
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
        right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, steps, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(u'val_acc: %.5f, best_val_acc: %.5f\n' %(val_acc, self.best_val_acc))
def predict_to_file(in_file, out_file):
"""输出预测结果到文件
结果文件可以提交到 https://www.cluebenchmarks.com 评测。
"""
fw = open(out_file, 'w')
with open(in_file) as fr:
for l in tqdm(fr):
l = json.loads(l)
text = l['sentence']
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
label = model.predict([[token_ids], [segment_ids]])[0].argmax()
l = json.dumps({'id': str(l['id']), 'label': str(label)})
fw.write(l + '\n')
fw.close()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=50, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
# predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json')
#! -*- coding: utf-8 -*-
# bert做language model任务,小说生成
import glob, re
from tqdm import tqdm
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, Callback, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
maxlen = 256
batch_size = 8
epochs = 10000
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
novels = []
for txt in glob.glob(filenames):
txt = open(txt, encoding='utf-8').read()
txt = txt.replace('\r', '').replace('\n', '')
txt = txt.replace(u'整理制作,并提供下载', '')
txt = re.sub(u'www.*?com', '', txt)
txt = txt.replace(u'\u3000', ' ')
sents = []
for t in txt.split(' '):
for s in re.findall(u'.*?。', t):
if len(s) <= maxlen - 2:
sents.append(s)
novels.append(sents)
data = []
pbar = tqdm(desc=u'构建语料中', total=sum(len(n) for n in novels))
for novel in novels:
s = u''
for i in range(len(novel)):
for j in range(len(novel) - i):
if len(s) + len(novel[i + j]) > maxlen - 2:
data.append(s)
s = u''
break
else:
s += novel[i + j]
pbar.update(1)
if i + j >= len(novel):
break
if s:
data.append(s)
pbar.close()
return data
def collate_fn(batch):
batch_token_ids, batch_segment_ids = [], []
for text in batch:
token_ids, segment_ids = tokenizer.encode(text)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_token_ids
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/pretrain/金庸小说/*.txt'),
batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 建模
model = build_transformer_model(
config_path,
checkpoint_path,
with_mlm=True,
application='lm',
keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
_, mlm_scores = outputs
mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
target = target[:, 1:].flatten()
return super().forward(mlm_scores, target)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
# 随机采样
class StoryCompletion(AutoRegressiveDecoder):
"""基于随机采样的故事续写
"""
@AutoRegressiveDecoder.wraps(default_rtype='logits')
def predict(self, inputs, output_ids, states):
token_ids = inputs[0]
token_ids = torch.cat([token_ids, output_ids], 1)
segment_ids = torch.zeros_like(token_ids, device=device)
_, mlm_scores = model.predict([token_ids, segment_ids])
return mlm_scores[:, -1, :]
def generate(self, text, n=1, topp=0.95):
token_ids, _ = tokenizer.encode(text)
results = self.random_sample([token_ids[:-1]], n, topp=topp) # 基于随机采样
return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]
story_completion = StoryCompletion(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)
def just_show():
s1 = u'当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。'
s2 = u'虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。'
s3 = u'杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。'
for s in [s1, s2, s3]:
t = story_completion.generate(s)
print(u'输入: %s' % s)
print(u'结果: %s\n' % ('\n'.join(t)))
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, steps, epoch, logs=None):
# 保存最优
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
# model.save_weights('./best_model.pt')
# 演示效果
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=100, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
"""
效果:
输入: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。
结果: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。次日清晨,张无忌便和赵敏去买了一匹高头大马,自己骑了随伴。那马甚有神骏,三十六斤重的身躯之中,竟无一头白马。他心中怦怦乱跳,暗想:若能将赵敏引出迷城,我决不致再和她相会,但若和赵姑娘相遇,我一生一世决计再难相见。何况我是她的私生女儿,这般亲热,岂不是好?我如何能和她相见?今后我要教训教训她才好?我教教她,教训她,要她心里快快活活的。他心如刀割,当即回到客店,将张无忌的所在说了。
输入: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。
结果: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。那矮子见他如此功力,大吃一惊,叫道:什么人?是谁?你干什么?我师父是谁?你们是谁?是谁?你们是谁?我师父是谁?你这矮子,便是段延庆。你们不知道我师父便是,是不是?快快说来。那矮子道:我师父便是延庆太子,他的徒弟也是段延庆。他老人家在唐朝做镇南王,你们便将他改名为延庆太子,叫做延庆太子!这名头倒怪,你们大伙儿听见了,也不知道他老人家是死是活。
输入: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。
结果: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。这时见他手中所握,竟是一柄特制的短剑,心中大喜,叫道::原来是金蛇郎君的剑!原来你便是金蛇郎君的弟子,这一下可要叫我失望了。那人哈哈一笑,说道:好啊!好啊,好啊!我的金蛇剑是我的,不过我是你的。这人道:我姓杨名过,名字叫过。你是我儿子,是我女儿,是不是?你这么大的年纪,怎地自称金刀驸马?我这就给你取个名字,叫作过儿。
"""
#! -*- coding: utf-8 -*-
# 用 语言模型+棋谱 的方式监督训练一个下中国象棋模型
# 介绍:https://kexue.fm/archives/7877
# 数据:https://github.com/bojone/gpt_cchess
# 模型训练可以在python2/python3进行。但是cchess模块只支持python3,
# 因此如果需要交互式体验模型棋力,那么需要在python3下进行。
# 权重转换脚本见:https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_roberta_chess.py
import json
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from cchess import *
# 基本信息
maxlen = 512
steps_per_epoch = 1000
epochs = 10000
batch_size = 16
# bert配置
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filename):
"""读取全局棋谱
返回:[(棋谱, 结果)],其中结果等于2为红方赢棋,1为和棋,
0为黑方赢棋,-1则为无明确标注胜负。
"""
D = []
with open(filename, encoding='utf-8') as f:
for l in f:
l = json.loads(l)
if not l['fen']:
result = int(l['items'].get(u'棋局结果', -1))
D.append((l['iccs'], result))
return D
# 建立分词器
chars = [u'[PAD]'] + list(u'0123456789abcdefghi')
token_dict = dict(zip(chars, range(len(chars))))
tokenizer = Tokenizer(token_dict)
tokenizer._token_unk_id = 0
bert_token_dict = load_vocab(dict_path)
keep_tokens = [bert_token_dict[c] for c in chars]
count = 0
def get_count():
if count < 20000:
n = 8
elif count < 40000:
n = 4
elif count < 80000:
n = 2
else:
n = 1
return n
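# Illustration of the length schedule above (comments only): with maxlen = 512, the encode
# length maxlen // get_count() + 1 grows as more batches are consumed:
#   count < 20000  -> 512 // 8 + 1 = 65 tokens
#   count < 40000  -> 512 // 4 + 1 = 129 tokens
#   count < 80000  -> 512 // 2 + 1 = 257 tokens
#   afterwards     -> 512 // 1 + 1 = 513 tokens
# i.e. early batches see only short opening prefixes of each game, presumably to speed up the
# early stages of training, before moving on to full game records.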
def collate_fn(batch):
"""数据生成器
"""
batch_token_ids, batch_segment_ids = [], []
for text, _ in batch:
token_ids, segment_ids = tokenizer.encode(' '.join(text), maxlen=maxlen // get_count() + 1)
batch_token_ids.append([0] + token_ids[1:-1])
batch_segment_ids.append([0] + segment_ids[1:-1])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
global count
count += 1
return [batch_token_ids, batch_segment_ids], batch_token_ids
# 加载数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/qipu/qipu.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 由于字典中0不代表padding位,为避免attention_mask计算错误,这里token_pad_ids=-100
model = build_transformer_model(config_path, checkpoint_path, application='lm', with_mlm=True,
keep_tokens=keep_tokens, token_pad_ids=-100).to(device)
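# Toy illustration of the comment above (a sketch; the exact mask construction is internal to
# bert4torch): the padding mask is derived from comparing token ids against token_pad_ids, and
# since id 0 is a real token in this custom 20-symbol vocabulary (it is prepended as the start
# token), masking against 0 would hide genuine positions, whereas -100 never collides.
_toy_ids = torch.tensor([[0, 5, 7, 0, 3]])
# (_toy_ids != 0)     -> would wrongly mask the two real 0-tokens
# (_toy_ids != -100)  -> keeps every position visible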
class CrossEntropyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, outputs, target):
_, mlm_scores = outputs
mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
target = target[:, 1:].flatten()
return super().forward(mlm_scores, target)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
class ChessPlayer(object):
"""交互式下棋程序
"""
def move_to_chinese(self, move):
"""将单步走法转为中文描述
"""
if not isinstance(move, Move):
move = Move(self.board, move[0], move[1])
return move.to_chinese()
def move_to_iccs(self, move):
"""将单步走法转为iccs表示
"""
if not isinstance(move, Move):
move = Move(self.board, move[0], move[1])
return move.to_iccs()
def print_board(self):
"""打印当前棋盘
直观起见,红方用红色表示,黑方用绿色表示。
"""
for l in self.board.dump_board():
for c in u'兵炮车马相仕帅':
l = l.replace(c, u'\033[1;31;40m%s\033[0m' % c)
for c in u'卒砲砗碼象士将':
l = l.replace(c, u'\033[1;32;40m%s\033[0m' % c)
print(l)
def movable_steps(self):
"""给出当前局面所有候选走法
"""
return [self.move_to_iccs(m) for m in self.board.create_moves()]
def human_input(self):
"""人类行棋
"""
while True:
try:
iccs = input(u'请输入iccs棋着: ')
print(iccs)
move = self.board.move_iccs(iccs)
if move is not None:
return iccs, move
except KeyboardInterrupt:
return None
except:
pass
def record(self, iccs):
"""将局面往前推进一步
"""
self.history += iccs
self.board.next_turn()
self.print_board()
self.current = (self.current + 1) % 2
def new_game(self, current=0):
"""开新局
"""
self.board = ChessBoard()
self.board.from_fen(FULL_INIT_FEN)
self.print_board()
self.history = ''
self.current = current
if self.current == 0: # 人类先手
iccs, move = self.human_input()
self.record(iccs)
while True:
# 机器走棋
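            # Scoring sketch (descriptive comments only): each legal move is appended to the game
            # history, the LM scores the 4 ICCS characters of that candidate continuation, and the
            # candidate with the highest summed log-probability is played.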
moves = self.movable_steps()
iccses = [' '.join(self.history + m) for m in moves]
token_ids = [[0] + tokenizer.encode(ic)[0][1:-1] for ic in iccses]
token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)
segment_ids = torch.zeros_like(token_ids)
preds = model.predict([token_ids, segment_ids])[-1][:, -5:-1]
preds = nn.Softmax(dim=-1)(preds)
preds = torch.take_along_dim(preds, token_ids[:, -4:, None], dim=2)
preds = torch.log(preds + 1e-8)[:, :, 0].sum(dim=1)
iccs = moves[preds.argmax()]
move = self.board.move_iccs(iccs)
self.record(iccs)
if self.board.is_win():
print(u'机器赢了')
break
# 人类走棋
iccs, move = self.human_input()
self.record(iccs)
if self.board.is_win():
print(u'人类赢了')
break
chessplayer = ChessPlayer()
class Evaluator(Callback):
"""评估与保存
"""
def on_epoch_end(self, global_step, epoch, logs=None):
# 保存模型
# model.save_weights('./best_model_chess.pt')
pass
if __name__ == '__main__':
choice = 'eval'
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, steps_per_epoch=1000, epochs=20, callbacks=[evaluator])
else:
model.load_weights('./best_model_chess.pt')
chessplayer.new_game(0) # 启动新棋局,0为人类先手,1为机器先手
#! -*- coding:utf-8 -*-
# Use a model from the transformers library inside bert4torch's training loop
# This script is mainly a demonstration; for real training it is better to stick to one framework
# A few scenarios where mixing the two is handy:
# 1) bert4torch's fit() makes it easy to add adversarial training, gradient penalty, virtual adversarial training, etc.
# 2) you temporarily want to use a model file straight from the transformers library
# 3) cross-checking that both frameworks give the same results while writing code
from transformers import AutoModelForSequenceClassification
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
maxlen = 128
batch_size = 16
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = AutoModelForSequenceClassification.from_pretrained("F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12", num_labels=2)
def forward(self, token_ids, segment_ids):
output = self.bert(input_ids=token_ids, token_type_ids=segment_ids)
return output.logits
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# 定义评价函数
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=100, grad_accumulation_steps=2, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding: utf-8 -*-
# 预训练脚本,单GPU版方便测试
# 改DDP需几行代码,参考https://github.com/Tongjilibo/bert4torch/blob/master/examples/training_trick/task_distributed_data_parallel.py
from bert4torch.models import build_transformer_model
from bert4torch.snippets import sequence_padding, Callback
from bert4torch.optimizers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import json
import os
import shelve
import random
import time
# 语料路径和模型保存路径
model_saved_path = './bert_model.ckpt'
dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain' # dir_training_data
task_name = 'roberta'
# 其他配置
maxlen = 512
batch_size = 7
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin' # 如果从零训练,就设为None
learning_rate = 0.00176
weight_decay_rate = 0.01 # 权重衰减
num_warmup_steps = 3125
num_train_steps = 125000
steps_per_epoch = 10000
grad_accum_steps = 16 # 大于1即表明使用梯度累积
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
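# With the defaults above: epochs = 125000 * 16 // 10000 = 200, i.e. each "epoch" is 10000
# dataloader steps, and with one optimizer step per grad_accum_steps = 16 accumulated batches
# this yields the intended num_train_steps = 125000 optimizer updates in total.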
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 读取数据集,构建数据张量
class MyDataset(Dataset):
def __init__(self, file):
super(MyDataset, self).__init__()
self.file = file
self.len = self._get_dataset_length()
self.db = self._load_data()
def __getitem__(self, index):
return self.db[str(index)]
def __len__(self):
return self.len
def _get_dataset_length(self):
file_record_info = self.file + ".json"
record_info = json.load(open(file_record_info, "r", encoding="utf-8"))
return record_info["samples_num"]
def _load_data(self):
return shelve.open(self.file)
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for item in batch:
batch_token_ids.append(item['input_ids'])
batch_labels.append(item['masked_lm_labels'])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids], batch_labels
# 从语料文件夹中随机选取一个文件,生成dataloader
def get_train_dataloader():
while True:
# prepare dataset
files_training_data = os.listdir(dir_training_data)
files_training_data = [file.split(".")[0] for file in files_training_data if "train" in file]
# 防止使用到正在生成的文件
files_training_data = [i for i in set(files_training_data) if files_training_data.count(i)==4]
if files_training_data:
file_train = random.choice(files_training_data)
for suffix in [".bak", ".dat", ".dir", ".json"]:
file_old = os.path.join(dir_training_data, file_train + suffix)
file_new = os.path.join(dir_training_data, task_name + suffix)
os.renames(file_old, file_new)
cur_load_file = file_new.split(".")[0]
train_dataloader = DataLoader(MyDataset(cur_load_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
break
else:
print("No training data! Sleep 300s!")
time.sleep(10)
continue
return train_dataloader
train_dataloader = get_train_dataloader()
model = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_mlm=True).to(device)
# weight decay
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay_rate},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
class MyLoss(nn.CrossEntropyLoss):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def forward(self, output, batch_labels):
y_preds = output[-1]
y_preds = y_preds.reshape(-1, y_preds.shape[-1])
return super().forward(y_preds, batch_labels.flatten())
# 定义使用的loss和optimizer,这里支持自定义
optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate, weight_decay=weight_decay_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
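# Following the usual linear-warmup convention, the scheduler ramps the learning rate from 0 up
# to learning_rate over the first num_warmup_steps (3125) updates, then decays it linearly back
# towards 0 by num_train_steps (125000).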
model.compile(loss=MyLoss(ignore_index=0), optimizer=optimizer, scheduler=scheduler)
class ModelCheckpoint(Callback):
"""自动保存最新模型
"""
def on_dataloader_end(self, logs=None):
# 在dataloader结束的时候,关闭db并且删除训练的文件
model.train_dataloader.dataset.db.close()
for suffix in [".bak", ".dat", ".dir", ".json"]:
file_remove = os.path.join(dir_training_data, task_name + suffix)
try:
os.remove(file_remove)
except:
print(f"Failed to remove training data {file_remove}.")
# 重新生成dataloader
model.train_dataloader = get_train_dataloader()
def on_epoch_end(self, global_step, epoch, logs=None):
model.save_weights(model_saved_path)
if __name__ == '__main__':
# 保存模型
checkpoint = ModelCheckpoint()
# 模型训练
model.fit(
train_dataloader,
steps_per_epoch=steps_per_epoch,
grad_accumulation_steps=grad_accum_steps,
epochs=epochs,
callbacks=[checkpoint],
)