Commit 92c75df1 authored by sunzhq2

yidong infer init
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT model, base version, CDial-GPT variant
# Project: https://github.com/thu-coai/CDial-GPT
# Reference project: https://github.com/bojone/CDial-GPT-tf
# The weights must be converted before loading; see the convert_script folder for the conversion script
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
speakers = [tokenizer.token_to_id('[speaker1]'), tokenizer.token_to_id('[speaker2]')]

# The config sets shared_segment_embeddings=True, so the segment embeddings reuse the word embedding weights
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='gpt',
).to(device)  # build the model and load the weights
class ChatBot(AutoRegressiveDecoder):
    """Chit-chat responses via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        curr_segment_ids = torch.zeros_like(output_ids) + token_ids[0, -1]
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, curr_segment_ids], 1)
        logits = model.predict([token_ids, segment_ids])
        return logits[:, -1, :]

    def response(self, texts, n=1, topk=5):
        token_ids = [tokenizer._token_start_id, speakers[0]]
        segment_ids = [tokenizer._token_start_id, speakers[0]]
        for i, text in enumerate(texts):
            ids = tokenizer.encode(text)[0][1:-1] + [speakers[(i + 1) % 2]]
            token_ids.extend(ids)
            segment_ids.extend([speakers[i % 2]] * len(ids))
            segment_ids[-1] = speakers[(i + 1) % 2]
        results = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return tokenizer.decode(results[0].cpu().numpy())

chatbot = ChatBot(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)
print(chatbot.response([u'别爱我没结果', u'你这样会失去我的', u'失去了又能怎样']))
"""
Replies are random, e.g.: 你还有我 | 那就不要爱我 | 你是不是傻 | etc.
"""
#! -*- coding: utf-8 -*-
# Basic test: MLM prediction with GAU_alpha; the results match the bert4keras version
# In testing, medium and long texts work noticeably better than short ones
# Blog post: https://kexue.fm/archives/9052
# Weight conversion script: ./convert_script/convert_GAU_alpha.py
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace the paths with your own
config_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/vocab.txt'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model='gau_alpha', with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("近期正是上市公司财报密集披露的时间,但有多家龙头公司的业绩令投资者失望")
token_ids[5] = token_ids[6] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 5:7], dim=-1).numpy()
    print(tokenizer.decode(result))
# Test the prediction quality of the BART language model
# bert4torch needs converted weights; see the convert folder
from transformers import BertTokenizer, BartForConditionalGeneration

tokenizer = BertTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/")
model = BartForConditionalGeneration.from_pretrained("F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/")
input_ids = tokenizer.encode("北京是[MASK]的首都", return_tensors='pt')
pred_ids = model.generate(input_ids, num_beams=4, max_length=20)
print('transformers output: ', tokenizer.convert_ids_to_tokens(pred_ids[0]))
# Output: ['[SEP]', '[CLS]', '北', '京', '是', '中', '国', '的', '首', '都', '[SEP]']

from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch

# BART configuration
config_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model='bart', segment_vocab_size=0).to(device)

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last step

    def generate(self, text, topk=4):
        token_ids, _ = tokenizer.encode(text, maxlen=128)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=102, end_id=tokenizer._token_end_id, maxlen=32, device=device)
print('bert4torch output: ', autotitle.generate("北京是[MASK]的首都"))
#! -*- coding: utf-8 -*-
# Basic test: MLM prediction
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace the paths with your own
root_model_path = "F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/bert_config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("科学技术是第一生产力")
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 3:5], dim=-1).numpy()
    print(tokenizer.decode(result))
#! -*- coding: utf-8 -*-
# Basic test: Tsinghua's open-source Chinese GPT2 model (2.6B parameters, CPM)
# Project: https://github.com/TsinghuaAI/CPM-Generate
# Blog introduction: https://kexue.fm/archives/7912
# The weights must be converted before loading; see the convert_script folder for the conversion script
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import SpTokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch
import jieba
jieba.initialize()

# Model paths
config_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/bert4torch_pytorch_model.bin'
spm_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/chinese_vocab.model'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def pre_tokenize(text):
    """Pre-tokenization: map spaces to '▂' (U+2582) and newlines to '▃' (U+2583)
    """
    return [
        w.replace(' ', u'\u2582').replace('\n', u'\u2583')
        for w in jieba.cut(text, cut_all=False)
    ]
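
# Illustrative example of the function above (the exact segmentation depends on
# jieba's dictionary and version, so this output is an assumption):
#   pre_tokenize('今天 天气') -> ['今天', '▂', '天气']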
tokenizer = SpTokenizer(
    spm_path,
    token_start=None,
    token_end=None,
    pre_tokenize=pre_tokenize,
    token_translate={u'\u2583': '<cls>'}  # map '▃' (i.e. '\n') to <cls>
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2', segment_vocab_size=0
).to(device)  # build the model and load the weights
class TextExpansion(AutoRegressiveDecoder):
    """Text continuation via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        logits = model.predict([token_ids])
        return logits[:, -1, :]

    def generate(self, text, n=1, topp=0.95, temperature=1):
        """The output has some randomness; if you only care about few-shot
        performance, consider switching the decoding to beam search.
        """
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp, temperature=temperature)  # random sampling
        results = [token_ids + [int(i) for i in ids.cpu().numpy()] for ids in results]
        texts = [tokenizer.decode(ids) for ids in results]
        return [self.post_replace(text) for text in texts]

    def post_replace(self, text):
        for s, t in [(' ', ''), (u'\u2582', ' '), (u'\u2583', '\n')]:
            text = text.replace(s, t)
        return text
text_expansion = TextExpansion(
    start_id=None,
    end_id=3,  # 3 is <cls>, which also serves as the newline token
    maxlen=16,
    device=device
)

# Commonsense reasoning
# Expected output here: 北京
query = u"""
美国的首都是华盛顿
法国的首都是巴黎
日本的首都是东京
中国的首都是
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Word translation
# Expected output here: bird
query = u"""
狗 dog
猫 cat
猪 pig
鸟
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Subject extraction
# Expected output here: 杨振宁
query = u"""
从1931年起,华罗庚在清华大学边学习边工作 华罗庚
在一间简陋的房间里,陈景润攻克了“哥德巴赫猜想” 陈景润
在这里,丘成桐得到IBM奖学金 丘成桐
杨振宁在粒子物理学、统计力学和凝聚态物理等领域作出里程碑性贡献
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Triple extraction
# Expected output here: 张红,体重,140斤
query = u"""
姚明的身高是211cm,是很多人心目中的偶像。 ->姚明,身高,211cm
毛泽东是绍兴人,早年在长沙读书。->毛泽东,出生地,绍兴
虽然周杰伦在欧洲办的婚礼,但是他是土生土长的中国人->周杰伦,国籍,中国
小明出生于武汉,但是却不喜欢在武汉生成,长大后去了北京。->小明,出生地,武汉
吴亦凡是很多人的偶像,但是他却是加拿大人,另很多人失望->吴亦凡,国籍,加拿大
武耀的生日在5月8号,这一天,大家都为他庆祝了生日->武耀,生日,5月8号
《青花瓷》是周杰伦最得意的一首歌。->周杰伦,作品,《青花瓷》
北京是中国的首都。->中国,首都,北京
蒋碧的家乡在盘龙城,毕业后去了深圳工作。->蒋碧,籍贯,盘龙城
上周我们和王立一起去了他的家乡云南玩昨天才回到了武汉。->王立,籍贯,云南
昨天11月17号,我和朋友一起去了海底捞,期间服务员为我的朋友刘章庆祝了生日。->刘章,生日,11月17号
张红的体重达到了140斤,她很苦恼。->
"""
print(text_expansion.generate(query[1:-1], 1)[0])
#! -*- coding: utf-8 -*-
# Basic test: ERNIE model
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace the paths with your own
root_model_path = "F:/Projects/pretrain_ckpt/ernie/[baidu_torch_base]--ernie-1-base-zh"
# root_model_path = "F:/Projects/pretrain_ckpt/ernie/[baidu_torch_base]--ernie-3-base-zh"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model='ERNIE', with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("科学技术是第一生产力")
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 3:5], dim=-1).numpy()
    print(tokenizer.decode(result))
#! -*- coding: utf-8 -*-
# Basic test: gpt2-ml
# Project (TF version): https://github.com/imcaspar/gpt2-ml
# The weights must be converted before loading; see the convert_script folder for the conversion script
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, token_start=None, token_end=None, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2_ml', segment_vocab_size=0).to(device)  # build the model and load the weights

class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        logits = model.predict([token_ids])
        return logits[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]

article_completion = ArticleCompletion(
    start_id=None,
    end_id=511,  # 511 is the Chinese full stop
    maxlen=256,
    minlen=128,
    device=device
)

for text in [u'今天天气不错', u'双十一', u'科学空间']:
    print(article_completion.generate(text))
"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好']
>>> article_completion.generate(u'双十一')
[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建']
>>> article_completion.generate(u'科学空间')
[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。']
"""
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT model, base version, open-sourced by Huawei
# Weights: https://pan.baidu.com/s/1-FB0yl1uxYDCGIRvU1XNzQ extraction code: xynn; the PyTorch-converted model files are used here
# Reference project: https://github.com/bojone/chinese-gen
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    segment_vocab_size=0,  # drop the segment_ids input
    application='lm',
).to(device)  # build the model and load the weights
class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        _, mlm_scores = model.predict([token_ids])
        return mlm_scores[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids = tokenizer.encode(text)[0][:-1]
        results = self.random_sample([token_ids], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]

article_completion = ArticleCompletion(
    start_id=None,
    end_id=511,  # 511 is the Chinese full stop
    maxlen=256,
    minlen=128,
    device=device
)

print(article_completion.generate(u'今天天气不错'))
"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好']
>>> article_completion.generate(u'双十一')
[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建']
>>> article_completion.generate(u'科学空间')
[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。']
"""
#! -*- coding: utf-8 -*-
# NEZHA model for chit-chat; only the inference script is provided here
# Source project: https://github.com/bojone/nezha_gpt_dialog
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_nezha_gpt_dialog.py
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch

# NEZHA configuration
config_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/vocab.txt'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Build the model and load the weights
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='nezha',
    application='lm',
)

class ChatBot(AutoRegressiveDecoder):
    """Dialogue bot via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        curr_segment_ids = torch.ones_like(output_ids) - segment_ids[0, -1]
        segment_ids = torch.cat([segment_ids, curr_segment_ids], 1)
        return model.predict([token_ids, segment_ids])[-1][:, -1]

    def response(self, texts, topk=5):
        token_ids, segment_ids = [tokenizer._token_start_id], [0]
        for i, text in enumerate(texts):
            ids = tokenizer.encode(text)[0][1:]
            token_ids.extend(ids)
            segment_ids.extend([i % 2] * len(ids))
        results = self.random_sample([token_ids, segment_ids], 1, topk)
        return tokenizer.decode(results[0].cpu().numpy())

chatbot = ChatBot(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)
print(chatbot.response([u'别爱我没结果', u'你这样会失去我的', u'失去了又能怎样']))
"""
Replies are random, e.g.: 那你还爱我吗 | 不知道 | 爱情是不是不能因为一点小事就否定了 | 我会一直爱你,你一个人会很辛苦 | etc.
"""
#! -*- coding: utf-8 -*-
# Basic test: MLM with the roformer and roformer_v2 models
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

choice = 'roformer_v2'  # roformer | roformer_v2
if choice == 'roformer':
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/"
    args_model = 'roformer'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/"
    args_model = 'roformer_v2'

# Load the model; replace the paths with your own
root_model_path = args_model_path
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model=args_model, with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("今天M很好,我M去公园玩。")
token_ids[3] = token_ids[8] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, logits = model([tokens_ids_tensor, segment_ids_tensor])

pred_str = 'Predict: '
for i, logit in enumerate(logits[0]):
    if token_ids[i] == tokenizer._token_mask_id:
        pred_str += tokenizer.id_to_token(torch.argmax(logit, dim=-1).item())
    else:
        pred_str += tokenizer.id_to_token(token_ids[i])
print(pred_str)
#! -*- coding: utf-8 -*-
# SimBERT/RoFormer-Sim: test similar-question generation and sentence similarity
# Official project: https://github.com/ZhuiyiTechnology/simbert
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import torch
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab

# Basic settings
maxlen = 32
choice = 'simbert_v2'  # simbert | simbert_v2
if choice == 'simbert':
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base"
    args_model = 'bert'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base"
    args_model = 'roformer'

# Load the simbert (or roformer-sim) weights
root_model_path = args_model_path
dict_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# Build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool='linear', model=args_model,
                                            application='unilm', keep_tokens=keep_tokens)
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pooler, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb

model = Model(pool_method='cls').to(device)
class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]

synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
r = [i for i in set(r) if i != text] # 不和原文相同
r = [text] + r
Z = cal_sen_emb(r)
Z /= (Z**2).sum(dim=1, keepdims=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
if __name__ == '__main__':
    choice = 'generate'  # generate | similarity
    if choice == 'generate':
        print(gen_synonyms('我想去北京玩玩可以吗', 10, 10))
    elif choice == 'similarity':
        target_text = '我想去首都北京玩玩'
        text_list = ['我想去北京玩', '北京有啥好玩的吗?我想去看看', '好渴望去北京游玩啊']
        Z = cal_sen_emb([target_text] + text_list)
        Z /= (Z**2).sum(dim=1, keepdim=True)**0.5
        similarity = torch.matmul(Z[1:], Z[0])
        for i, line in enumerate(text_list):
            print(f'cos_sim: {similarity[i].item():.4f}, tgt_text: "{target_text}", cal_text: "{line}"')
    else:
        model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# Run T5 PEGASUS, which uses a BertTokenizer-style vocabulary
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import AutoRegressiveDecoder
import jieba
jieba.initialize()

# Model configuration
# pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small/'
pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/'
config_path = pretrain_model + 'config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
dict_path = pretrain_model + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the tokenizer, using jieba word segmentation as the pre-tokenizer
tokenizer = Tokenizer(
    dict_path,
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False)
)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='mt5.1.1',
    segment_vocab_size=0
).to(device)
class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        # inputs contains [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last step

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=256)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])

autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=32, device=device)

if __name__ == '__main__':
    print(autotitle.generate('今天天气不错啊'))
    # small version output: 我是个女的,我想知道我是怎么想的
    # base version output: 请问明天的天气怎么样啊?
#! -*- coding: utf-8 -*-
# Run the transformer_xl model; it is not widely used and no Chinese pretrained weights were found,
# so the English pretrained model from transformers is used to verify correctness.
# last_hidden_state was inspected by debugging into the transformers package; it matches this framework.
# Conversion script: convert_script/convert_transformer_xl.py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

pretrained_model = "F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103"

# ----------------------transformers package----------------------
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForCausalLM.from_pretrained(pretrained_model)
model.eval()

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    # the intermediate states can only be inspected with a breakpoint here
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.losses
print('transformers loss: ', loss)

# ----------------------bert4torch----------------------
from bert4torch.models import build_transformer_model
config_path = f'{pretrained_model}/bert4torch_config.json'
checkpoint_path = f'{pretrained_model}/bert4torch_pytorch_model.bin'

model = build_transformer_model(
    config_path,
    checkpoint_path=checkpoint_path,
    model='transformer_xl',
)
print('bert4torch last_hidden_state: ', model.predict([inputs['input_ids']]))
# tensor([[[ 0.1027,  0.0604, -0.2585,  ...,  0.3137, -0.2679,  0.1036],
#          [ 0.3482, -0.0458, -0.4582,  ...,  0.0242, -0.0721,  0.2311],
#          [ 0.3426, -0.1353, -0.4145,  ...,  0.1123,  0.1374,  0.1313],
#          [ 0.0038, -0.0978, -0.5570,  ...,  0.0487, -0.1891, -0.0608],
#          [-0.2155, -0.1388, -0.5549,  ..., -0.1458,  0.0774,  0.0419],
#          [ 0.0967, -0.1781, -0.4328,  ..., -0.1831, -0.0808,  0.0890]]])
#! -*- coding: utf-8 -*-
# Run the pretrained t5-chinese model for direct prediction; uses a BertTokenizer-style vocabulary
# This T5 uses the t5.1.0 architecture
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import AutoRegressiveDecoder

# Model configuration
config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/vocab.txt'
# config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/bert4torch_config.json'
# checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/pytorch_model.bin'
# dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the vocabulary and build the tokenizer
token_dict = load_vocab(
    dict_path=dict_path,
    simplified=False,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='t5.1.0',
    segment_vocab_size=0
).to(device)

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = inputs[0]
        return model.predict([[token_ids], [output_ids]])[-1][:, -1, :]  # keep only the last step

    def generate(self, text, topk=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text, maxlen=256)
        output_ids = self.beam_search([token_ids], topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=1, maxlen=32, device=device)  # end_id could be set to tokenizer._token_end_id for shorter outputs

if __name__ == '__main__':
    print(autotitle.generate('中国的首都是extra0京'))
# Compare XLNet last_hidden_state between transformers and bert4torch
from transformers import XLNetTokenizer, XLNetModel
import torch

pretrained_model = "F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base"
tokenizer = XLNetTokenizer.from_pretrained(pretrained_model)
model = XLNetModel.from_pretrained(pretrained_model)

inputs = tokenizer(["你好啊,我叫张三", "天气不错啊"], padding=True, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print('--------transformers last_hidden_state--------\n', last_hidden_states)

# ----------------------bert4torch----------------------
from bert4torch.models import build_transformer_model
config_path = f'{pretrained_model}/bert4torch_config.json'
checkpoint_path = f'{pretrained_model}/pytorch_model.bin'

model = build_transformer_model(
    config_path,
    checkpoint_path=checkpoint_path,
    model='xlnet',
    # with_lm=True
    token_pad_ids=tokenizer.pad_token_id,
)
print('--------bert4torch last_hidden_state--------\n', model.predict([inputs['input_ids'], inputs['token_type_ids']]))
#! -*- coding: utf-8 -*-
# Give a lowercase-only model the ability to distinguish case by simply modifying its vocabulary
# Basic idea: add uppercased variants of the English tokens to the vocabulary and extend the model's Embedding layer accordingly
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
import torch

root_model_path = "F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/bert_config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

token_dict = load_vocab(vocab_path)
new_token_dict = token_dict.copy()
compound_tokens = []

for t, i in sorted(token_dict.items(), key=lambda s: s[1]):
    # Two cases are handled here: 1. first letter capitalized; 2. entire word uppercased.
    # Under Python 2 this adds 5594 tokens; under Python 3 it adds 5596.
    tokens = []
    if t.isalpha():
        tokens.extend([t[:1].upper() + t[1:], t.upper()])
    elif t[:2] == '##' and t[2:].isalpha():
        tokens.append(t.upper())
    for token in tokens:
        if token not in new_token_dict:
            compound_tokens.append([i])
            new_token_dict[token] = len(new_token_dict)

tokenizer = Tokenizer(new_token_dict, do_lower_case=False)
model = build_transformer_model(
    config_path,
    checkpoint_path,
    compound_tokens=compound_tokens,  # new tokens, initialized from the average of the old tokens they map to
)

text = u'Welcome to BEIJING.'
tokens = tokenizer.tokenize(text)
print(tokens)
"""
输出:['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]']
"""
token_ids, segment_ids = tokenizer.encode(text)
token_ids, segment_ids = torch.tensor([token_ids]), torch.tensor([segment_ids])
model.eval()
with torch.no_grad():
    print(model([token_ids, segment_ids])[0])
"""
Output:
[[[-1.4999904e-01 1.9651388e-01 -1.7924258e-01 ... 7.8269649e-01
2.2241375e-01 1.1325148e-01]
[-4.5268752e-02 5.5090344e-01 7.4699545e-01 ... -4.7773960e-01
-1.7562288e-01 4.1265407e-01]
[ 7.0158571e-02 1.7816302e-01 3.6949167e-01 ... 9.6258509e-01
-8.4678203e-01 6.3776302e-01]
...
[ 9.3637377e-01 3.0232478e-02 8.1411439e-01 ... 7.9186147e-01
7.5704646e-01 -8.3475001e-04]
[ 2.3699696e-01 2.9953337e-01 8.1962071e-02 ... -1.3776925e-01
3.8681498e-01 3.2553676e-01]
[ 1.9728680e-01 7.7782705e-02 5.2951699e-01 ... 8.9622810e-02
-2.3932748e-02 6.9600858e-02]]]
"""
# Multiprocess/multithread parallel_apply test
from tqdm import tqdm
from bert4torch.tokenizers import Tokenizer
import torch
import numpy as np
from bert4torch.snippets import parallel_apply
import time

dict_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
categories = {'LOC': 2, 'PER': 3, 'ORG': 4}

# Relative-distance buckets: distance d maps to roughly 1 + floor(log2(d)),
# i.e. dis2idx[1] = 1, dis2idx[2:4] = 2, dis2idx[4:8] = 3, ..., dis2idx[256:] = 9
dis2idx = np.zeros((1000), dtype='int64')
dis2idx[1] = 1
dis2idx[2:] = 2
dis2idx[4:] = 3
dis2idx[8:] = 4
dis2idx[16:] = 5
dis2idx[32:] = 6
dis2idx[64:] = 7
dis2idx[128:] = 8
dis2idx[256:] = 9
# Small helper functions
def convert_index_to_text(index, type):
    """Encode a list of positions plus an entity type as a single string key"""
    text = "-".join([str(i) for i in index])
    text = text + "-#-{}".format(type)
    return text

def convert_text_to_index(text):
    """Inverse of convert_index_to_text"""
    index, type = text.split("-#-")
    index = [int(x) for x in index.split("-")]
    return index, int(type)
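
# A quick round trip through the two helpers above (illustrative values):
#   convert_index_to_text([3, 4, 5], categories['PER'])  -> '3-4-5-#-3'
#   convert_text_to_index('3-4-5-#-3')                   -> ([3, 4, 5], 3)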
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
maxlen = 256

def load_data(filename):
    """Load BIO-tagged NER data as (sentence, [[start, end, label], ...]) pairs"""
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in tqdm(f.split('\n\n'), desc='Load data'):
            if not l:
                continue
            sentence, d = [], []
            for i, c in enumerate(l.split('\n')):
                char, flag = c.split(' ')
                sentence += char
                if flag[0] == 'B':
                    d.append([i, i, flag[2:]])
                elif flag[0] == 'I':
                    d[-1][1] = i
            if len(sentence) > maxlen - 2:
                continue
            D.append((sentence, d))
    return D
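
# Expected input format (one "char label" pair per line, sentences separated by
# a blank line), illustrated with hypothetical rows:
#   中 B-LOC
#   国 I-LOC
#   很 O
#   大 O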
def func(inputs):
    sentence, d = inputs
    tokens = [tokenizer.tokenize(word)[1:-1] for word in sentence[:maxlen-2]]
    pieces = [piece for pieces in tokens for piece in pieces]
    tokens_ids = [tokenizer._token_start_id] + tokenizer.tokens_to_ids(pieces) + [tokenizer._token_end_id]
    assert len(tokens_ids) <= maxlen
    length = len(tokens)

    # piece-to-word mapping; for Chinese the two coincide, apart from [CLS] and [SEP]
    _pieces2word = np.zeros((length, len(tokens_ids)), dtype=bool)
    e_start = 0
    for i, pieces in enumerate(tokens):
        if len(pieces) == 0:
            continue
        pieces = list(range(e_start, e_start + len(pieces)))
        _pieces2word[i, pieces[0] + 1:pieces[-1] + 2] = 1
        e_start += len(pieces)

    # relative distances
    _dist_inputs = np.zeros((length, length), dtype=int)
    for k in range(length):
        _dist_inputs[k, :] += k
        _dist_inputs[:, k] -= k
    for i in range(length):
        for j in range(length):
            if _dist_inputs[i, j] < 0:
                _dist_inputs[i, j] = dis2idx[-_dist_inputs[i, j]] + 9
            else:
                _dist_inputs[i, j] = dis2idx[_dist_inputs[i, j]]
    _dist_inputs[_dist_inputs == 0] = 19

    # gold labels
    _grid_labels = np.zeros((length, length), dtype=int)
    _grid_mask2d = np.ones((length, length), dtype=bool)
    for entity in d:
        e_start, e_end, e_type = entity[0], entity[1] + 1, entity[-1]
        if e_end >= maxlen - 2:
            continue
        index = list(range(e_start, e_end))
        for i in range(len(index)):
            if i + 1 >= len(index):
                break
            _grid_labels[index[i], index[i + 1]] = 1
        _grid_labels[index[-1], index[0]] = categories[e_type]
    _entity_text = set([convert_index_to_text(list(range(e[0], e[1]+1)), categories[e[-1]]) for e in d])
    return tokens_ids, _pieces2word, _dist_inputs, _grid_labels, _grid_mask2d, _entity_text
corpus = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')

start = time.time()
train_samples = parallel_apply(
    func=func,
    iterable=corpus,
    workers=8,
    max_queue_size=2000,
    dummy=False,  # on Windows set this to True (multiprocessing.dummy, i.e. threads)
    callback=None,
    unordered=False
)
print(time.time() - start)
# Test whether the bert4torch tokenizers match the transformers tokenizers; after testing, they match
from transformers import BertTokenizer, XLNetTokenizer, XLNetTokenizerFast
from bert4torch.tokenizers import Tokenizer, SpTokenizer
from tqdm import tqdm

choice = 1
if choice:
    print('Test BertTokenizer')
    tokenizer_transformers = BertTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12")
    tokenizer_bert4torch = Tokenizer('F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True, do_tokenize_unk=True)
else:
    print('Test SpTokenizer')
    tokenizer_transformers = XLNetTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base")
    # tokenizer_transformers = XLNetTokenizerFast.from_pretrained("F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base")  # the fast version differs slightly
    tokenizer_bert4torch = tokenizer = SpTokenizer('F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base/spiece.model', token_start=None, token_end=None)

with open('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data', 'r', encoding='utf-8') as f:
    for l in tqdm(f):
        l = l.split('\t')[0].strip()
        tokens1 = tokenizer_transformers.tokenize(l)
        tokens2 = tokenizer_bert4torch.tokenize(l)
        tokens2 = tokens2[1:-1] if choice == 1 else tokens2
        if tokens1 != tokens2:
            print(''.join(tokens1))
            print(''.join(tokens2))
            print('------------------------------')
# Pretrained weight configs
Configs that need to be supplied separately for bert4torch; some of the weights can also be inspected via the conversion scripts
----
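These configs are consumed as the `config_path` argument of `build_transformer_model`, as the example scripts above do. A minimal sketch (the paths are placeholders; it assumes one of the JSON blocks below has been saved next to the converted weights):
```python
from bert4torch.models import build_transformer_model

config_path = '/path/to/bert4torch_config.json'            # one of the JSON blocks below
checkpoint_path = '/path/to/bert4torch_pytorch_model.bin'  # converted weights
model = build_transformer_model(config_path, checkpoint_path, model='xlnet')  # model matches the config's architecture
```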
- xlnet/[hit_torch_base]--chinese-xlnet-base
```json
{
"architectures": [
"XLNetLMHeadModel"
],
"attn_type": "bi",
"bi_data": false,
"bos_token_id": 1,
"clamp_len": -1,
"intermediate_size": 3072,
"hidden_size": 768,
"hidden_dropout_prob": 0.1,
"end_n_top": 5,
"eos_token_id": 2,
"hidden_act": "relu",
"initializer_range": 0.02,
"layer_norm_eps": 1e-12,
"mem_len": null,
"model_type": "xlnet",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"output_past": true,
"pad_token_id": 5,
"reuse_len": null,
"same_length": false,
"start_n_top": 5,
"summary_activation": "tanh",
"summary_last_hidden_dropout_prob": 0.1,
"summary_type": "last",
"summary_use_proj": true,
"untie_r": true,
"vocab_size": 32000
}
```
- gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768
```json
{
"hidden_act": "swish",
"hidden_size": 768,
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"num_attention_heads": 1,
"attention_key_size": 128,
"intermediate_size": 1536,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 12000
}
```
- gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base
```json
{
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 513,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"vocab_size": 13088,
"type_vocab_size": 3,
"shared_segment_embeddings": true
}
```
- gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b
```json
{
"vocab_size": 30000,
"hidden_size": 2560,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 10240,
"max_position_embeddings": 1024,
"num_attention_heads": 32,
"num_hidden_layers": 32
}
```
- gpt2/[gpt2-ml_torch_15g]
```json
{
"vocab_size": 21130,
"hidden_size": 1536,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 6144,
"max_position_embeddings": 1024,
"num_attention_heads": 24,
"num_hidden_layers": 48
}
```
- t5/[google_mt5_torch_base]
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu_new",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 250112,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "relu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 21228,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "relu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_attention_heads": 8,
"num_hidden_layers": 6,
"type_vocab_size": 2,
"vocab_size": 21228,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small
```json
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"num_attention_heads": 6,
"attention_head_size": 64,
"num_hidden_layers": 8,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base
```json
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2048,
"num_attention_heads": 12,
"attention_head_size": 64,
"num_hidden_layers": 12,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- bart/[FudanNLP_torch_base]
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"type_vocab_size": 2,
"vocab_size": 21128
}
```
\ No newline at end of file
# TensorFlow weights: https://github.com/ZhuiyiTechnology/GAU-alpha
# The variables are mapped directly onto the GAU_alpha structure here, so no mapping table is needed
import torch
import tensorflow as tf

tf_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-tf]--chinese_GAU-alpha-char_L-24_H-768/bert_model.ckpt'
torch_state_dict = {}

ts = tf.train.load_variable(tf_path, 'bert/embeddings/word_embeddings')
torch_state_dict['embeddings.word_embeddings.weight'] = torch.from_numpy(ts)
torch_state_dict['mlmDecoder.weight'] = torch.from_numpy(ts)
ts = tf.train.load_variable(tf_path, 'bert/embeddings/token_type_embeddings')
torch_state_dict['embeddings.segment_embeddings.weight'] = torch.from_numpy(ts)

for i in range(24):
    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/i_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.i_dense.weight'] = torch.from_numpy(ts.T)
    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/o_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.o_dense.weight'] = torch.from_numpy(ts.T)
    ts1 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/q_scaleoffset/gamma')
    ts2 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/k_scaleoffset/gamma')
    ts = torch.stack([torch.from_numpy(ts1), torch.from_numpy(ts2)], dim=0)
    torch_state_dict[f'encoderLayer.{i}.gau.offsetscale.gamma'] = ts

torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin')
# config file
'''
{
"hidden_act": "swish",
"hidden_size": 768,
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"num_attention_heads": 1,
"attention_key_size": 128,
"intermediate_size": 1536,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 12000
}
'''
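
# Optional sanity check: a sketch, assuming the weights saved above and the config
# stored as bert_config.json (mirrors the GAU-alpha MLM example in this commit):
# from bert4torch.models import build_transformer_model
# model = build_transformer_model(
#     'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json',
#     'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin',
#     model='gau_alpha', with_mlm='softmax'
# )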