sunzhq2 / yidong-infer · Commit 0e29b9b7

yidong infer init

Authored Jan 20, 2026 by xuxo.
Pipeline #3252 failed with stages in 0 seconds.
Changes: 150 · Pipelines: 1
Showing 20 changed files with 1330 additions and 0 deletions (+1330 -0)
Changed files on this page:

- bert/bert4torch_cmcc/examples/basic/basic_language_model_CDial_GPT.py (+58 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_GAU_alpha.py (+33 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_bart.py (+46 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_bert.py (+31 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_cpm_lm.py (+121 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_ernie.py (+33 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_gpt2_ml.py (+57 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_nezha_gen_gpt.py (+60 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_nezha_gpt_dialog.py (+54 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_roformer.py (+45 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_simbert.py (+130 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_t5_pegasus.py (+55 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_transformer_xl.py (+41 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_uer_t5.py (+53 -0)
- bert/bert4torch_cmcc/examples/basic/basic_language_model_xlnet.py (+28 -0)
- bert/bert4torch_cmcc/examples/basic/basic_make_uncased_model_cased.py (+67 -0)
- bert/bert4torch_cmcc/examples/basic/basic_test_parallel_apply.py (+125 -0)
- bert/bert4torch_cmcc/examples/basic/basic_test_tokenizer.py (+27 -0)
- bert/bert4torch_cmcc/examples/convert_script/PLM_config.md (+219 -0)
- bert/bert4torch_cmcc/examples/convert_script/convert_GAU_alpha.py (+47 -0)
Too many changes to show. To preserve performance only 150 of 150+ files are displayed.
bert/bert4torch_cmcc/examples/basic/basic_language_model_CDial_GPT.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT model, base version, CDial-GPT variant
# Project: https://github.com/thu-coai/CDial-GPT
# Reference project: https://github.com/bojone/CDial-GPT-tf
# The weights must be converted before loading; see the convert_script folder
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
speakers = [tokenizer.token_to_id('[speaker1]'), tokenizer.token_to_id('[speaker2]')]

# The config sets shared_segment_embeddings=True, so segment embeddings are generated from the word-embedding weights
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='gpt',
).to(device)  # build the model and load the weights


class ChatBot(AutoRegressiveDecoder):
    """Chit-chat replies based on random sampling"""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        curr_segment_ids = torch.zeros_like(output_ids) + token_ids[0, -1]
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, curr_segment_ids], 1)
        logits = model.predict([token_ids, segment_ids])
        return logits[:, -1, :]

    def response(self, texts, n=1, topk=5):
        token_ids = [tokenizer._token_start_id, speakers[0]]
        segment_ids = [tokenizer._token_start_id, speakers[0]]
        for i, text in enumerate(texts):
            ids = tokenizer.encode(text)[0][1:-1] + [speakers[(i + 1) % 2]]
            token_ids.extend(ids)
            segment_ids.extend([speakers[i % 2]] * len(ids))
            segment_ids[-1] = speakers[(i + 1) % 2]
        results = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return tokenizer.decode(results[0].cpu().numpy())


chatbot = ChatBot(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)
print(chatbot.response([u'别爱我没结果', u'你这样会失去我的', u'失去了又能怎样']))
"""
Replies are random, e.g.: 你还有我 | 那就不要爱我 | 你是不是傻 | and so on.
"""
```
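To make the token/segment interleaving in `response()` above concrete, here is a hand-worked trace. The ids are made up purely for illustration; the real values come from the tokenizer.

```python
# Illustrative ids only: stand-ins for [CLS], [speaker1], [speaker2] and two-token utterances
CLS, SPK1, SPK2 = 101, 13086, 13087
utterances = [[11, 12], [21, 22], [31, 32]]
speakers = [SPK1, SPK2]

token_ids = [CLS, SPK1]
segment_ids = [CLS, SPK1]
for i, ids in enumerate(utterances):
    ids = ids + [speakers[(i + 1) % 2]]      # each utterance ends with the *next* speaker's token
    token_ids.extend(ids)
    segment_ids.extend([speakers[i % 2]] * len(ids))
    segment_ids[-1] = speakers[(i + 1) % 2]  # the trailing speaker token already belongs to the next turn
print(token_ids)    # [101, 13086, 11, 12, 13087, 21, 22, 13086, 31, 32, 13087]
print(segment_ids)  # [101, 13086, 13086, 13086, 13087, 13087, 13087, 13086, 13086, 13086, 13087]
```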
bert/bert4torch_cmcc/examples/basic/basic_language_model_GAU_alpha.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: MLM prediction with GAU_alpha; results match the bert4keras version
# In tests, medium and long texts do noticeably better than short texts
# Blog: https://kexue.fm/archives/9052
# Weight conversion script: ./convert_script/convert_GAU_alpha.py
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace with your own paths
config_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path, model='gau_alpha', with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("近期正是上市公司财报密集披露的时间,但有多家龙头公司的业绩令投资者失望")
token_ids[5] = token_ids[6] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# with_mlm must be passed for the model to return MLM probabilities
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 5:7], dim=-1).numpy()
    print(tokenizer.decode(result))
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_bart.py (new file, 0 → 100644)

```python
# Test the prediction quality of the BART language model
# bert4torch requires converted weights; see the convert_script folder
from transformers import BertTokenizer, BartForConditionalGeneration

tokenizer = BertTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/")
model = BartForConditionalGeneration.from_pretrained("F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/")
input_ids = tokenizer.encode("北京是[MASK]的首都", return_tensors='pt')
pred_ids = model.generate(input_ids, num_beams=4, max_length=20)
print('transformers output: ', tokenizer.convert_ids_to_tokens(pred_ids[0]))
# Output: ['[SEP]', '[CLS]', '北', '京', '是', '中', '国', '的', '首', '都', '[SEP]']

from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch

# bert config
config_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model='bart', segment_vocab_size=0).to(device)


class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder"""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=4):
        token_ids, _ = tokenizer.encode(text, maxlen=128)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())


autotitle = AutoTitle(start_id=102, end_id=tokenizer._token_end_id, maxlen=32, device=device)
print('bert4torch output: ', autotitle.generate("北京是[MASK]的首都"))
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_bert.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: MLM prediction
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace with your own paths
root_model_path = "F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/bert_config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

tokenizer = Tokenizer(vocab_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path, with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("科学技术是第一生产力")
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# with_mlm must be passed for the model to return MLM probabilities
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 3:5], dim=-1).numpy()
    print(tokenizer.decode(result))
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_cpm_lm.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: Tsinghua's open-source Chinese GPT2 model (2.6B parameters)
# Project: https://github.com/TsinghuaAI/CPM-Generate
# Blog post: https://kexue.fm/archives/7912
# The weights must be converted before loading; see the convert_script folder
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import SpTokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch
import jieba
jieba.initialize()

# Model paths
config_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/bert4torch_pytorch_model.bin'
spm_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/chinese_vocab.model'
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def pre_tokenize(text):
    """Pre-tokenization: replace '\n' with '▃' (\u2583) and ' ' with '▂' (\u2582)"""
    return [
        w.replace(' ', u'\u2582').replace('\n', u'\u2583')
        for w in jieba.cut(text, cut_all=False)
    ]


tokenizer = SpTokenizer(
    spm_path,
    token_start=None,
    token_end=None,
    pre_tokenize=pre_tokenize,
    token_translate={u'\u2583': '<cls>'}  # '\n' maps to <cls>
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='gpt2',
    segment_vocab_size=0
).to(device)  # build the model and load the weights


class TextExpansion(AutoRegressiveDecoder):
    """Text continuation based on random sampling"""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        logits = model.predict([token_ids])
        return logits[:, -1, :]

    def generate(self, text, n=1, topp=0.95, temperature=1):
        """The output is somewhat random; if you only care about few-shot
        performance, consider switching the decoding to beam search."""
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp, temperature=temperature)  # random sampling
        results = [token_ids + [int(i) for i in ids.cpu().numpy()] for ids in results]
        texts = [tokenizer.decode(ids) for ids in results]
        return [self.post_replace(text) for text in texts]

    def post_replace(self, text):
        for s, t in [(' ', ''), (u'\u2582', ' '), (u'\u2583', '\n')]:
            text = text.replace(s, t)
        return text


text_expansion = TextExpansion(
    start_id=None,
    end_id=3,  # 3 is <cls>, which doubles as the newline token
    maxlen=16,
    device=device
)

# Common-sense reasoning; this example outputs: 北京
query = u"""
美国的首都是华盛顿
法国的首都是巴黎
日本的首都是东京
中国的首都是
"""
print(text_expansion.generate(query[1:-1], 1)[0])

# Word translation; this example outputs: bird
query = u"""
狗 dog
猫 cat
猪 pig
鸟
"""
print(text_expansion.generate(query[1:-1], 1)[0])

# Subject extraction; this example outputs: 杨振宁
query = u"""
从1931年起,华罗庚在清华大学边学习边工作 华罗庚
在一间简陋的房间里,陈景润攻克了“哥德巴赫猜想” 陈景润
在这里,丘成桐得到IBM奖学金 丘成桐
杨振宁在粒子物理学、统计力学和凝聚态物理等领域作出里程碑性贡献
"""
print(text_expansion.generate(query[1:-1], 1)[0])

# Triple extraction; this example outputs: 张红,体重,140斤
query = u"""
姚明的身高是211cm,是很多人心目中的偶像。 ->姚明,身高,211cm
毛泽东是绍兴人,早年在长沙读书。->毛泽东,出生地,绍兴
虽然周杰伦在欧洲办的婚礼,但是他是土生土长的中国人->周杰伦,国籍,中国
小明出生于武汉,但是却不喜欢在武汉生成,长大后去了北京。->小明,出生地,武汉
吴亦凡是很多人的偶像,但是他却是加拿大人,另很多人失望->吴亦凡,国籍,加拿大
武耀的生日在5月8号,这一天,大家都为他庆祝了生日->武耀,生日,5月8号
《青花瓷》是周杰伦最得意的一首歌。->周杰伦,作品,《青花瓷》
北京是中国的首都。->中国,首都,北京
蒋碧的家乡在盘龙城,毕业后去了深圳工作。->蒋碧,籍贯,盘龙城
上周我们和王立一起去了他的家乡云南玩昨天才回到了武汉。->王立,籍贯,云南
昨天11月17号,我和朋友一起去了海底捞,期间服务员为我的朋友刘章庆祝了生日。->刘章,生日,11月17号
张红的体重达到了140斤,她很苦恼。->
"""
print(text_expansion.generate(query[1:-1], 1)[0])
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_ernie.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: ERNIE model
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace with your own paths
root_model_path = "F:/Projects/pretrain_ckpt/ernie/[baidu_torch_base]--ernie-1-base-zh"
# root_model_path = "F:/Projects/pretrain_ckpt/ernie/[baidu_torch_base]--ernie-3-base-zh"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

tokenizer = Tokenizer(vocab_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path, model='ERNIE', with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("科学技术是第一生产力")
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# with_mlm must be passed for the model to return MLM probabilities
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 3:5], dim=-1).numpy()
    print(tokenizer.decode(result))
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_gpt2_ml.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: gpt2_ml
# Project (tf version): https://github.com/imcaspar/gpt2-ml
# The weights must be converted before loading; see the convert_script folder
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, token_start=None, token_end=None, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='gpt2_ml',
    segment_vocab_size=0
).to(device)  # build the model and load the weights


class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation based on random sampling"""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        logits = model.predict([token_ids])
        return logits[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]


article_completion = ArticleCompletion(
    start_id=None,
    end_id=511,  # 511 is the Chinese full stop
    maxlen=256,
    minlen=128,
    device=device
)

for text in [u'今天天气不错', u'双十一', u'科学空间']:
    print(article_completion.generate(text))

"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好']
>>> article_completion.generate(u'双十一')
[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建']
>>> article_completion.generate(u'科学空间')
[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。']
"""
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_nezha_gen_gpt.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT model, base version, open-sourced by Huawei
# Weights: https://pan.baidu.com/s/1-FB0yl1uxYDCGIRvU1XNzQ (extraction code: xynn); the PyTorch-converted model files are used here
# Reference project: https://github.com/bojone/chinese-gen
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    segment_vocab_size=0,  # drop the segment_ids input
    application='lm',
).to(device)  # build the model and load the weights


class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation based on random sampling"""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        _, mlm_scores = model.predict([token_ids])
        return mlm_scores[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids = tokenizer.encode(text)[0][:-1]
        results = self.random_sample([token_ids], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]


article_completion = ArticleCompletion(
    start_id=None,
    end_id=511,  # 511 is the Chinese full stop
    maxlen=256,
    minlen=128,
    device=device
)

print(article_completion.generate(u'今天天气不错'))
"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好']
>>> article_completion.generate(u'双十一')
[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建']
>>> article_completion.generate(u'科学空间')
[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。']
"""
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_nezha_gpt_dialog.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Chit-chat with the NEZHA model; only the test script is provided here
# Source project: https://github.com/bojone/nezha_gpt_dialog
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_nezha_gpt_dialog.py
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch

# nezha config
config_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer

# Build and load the model
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='nezha',
    application='lm',
)


class ChatBot(AutoRegressiveDecoder):
    """Chatbot based on random sampling"""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.concat([token_ids, output_ids], 1)
        curr_segment_ids = torch.ones_like(output_ids) - segment_ids[0, -1]
        segment_ids = torch.concat([segment_ids, curr_segment_ids], 1)
        return model.predict([token_ids, segment_ids])[-1][:, -1]

    def response(self, texts, topk=5):
        token_ids, segment_ids = [tokenizer._token_start_id], [0]
        for i, text in enumerate(texts):
            ids = tokenizer.encode(text)[0][1:]
            token_ids.extend(ids)
            segment_ids.extend([i % 2] * len(ids))
        results = self.random_sample([token_ids, segment_ids], 1, topk)
        return tokenizer.decode(results[0].cpu().numpy())


chatbot = ChatBot(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)
print(chatbot.response([u'别爱我没结果', u'你这样会失去我的', u'失去了又能怎样']))
"""
Replies are random, e.g.: 那你还爱我吗 | 不知道 | 爱情是不是不能因为一点小事就否定了 | 我会一直爱你,你一个人会很辛苦 | and so on.
"""
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_roformer.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Basic test: MLM with the roformer and roformer_v2 models
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

choice = 'roformer_v2'  # roformer | roformer_v2
if choice == 'roformer':
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/"
    args_model = 'roformer'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/"
    args_model = 'roformer_v2'

# Load the model; replace with your own paths
root_model_path = args_model_path
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

tokenizer = Tokenizer(vocab_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path, model=args_model, with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("今天M很好,我M去公园玩。")
token_ids[3] = token_ids[8] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# with_mlm must be passed for the model to return MLM logits
model.eval()
with torch.no_grad():
    _, logits = model([tokens_ids_tensor, segment_ids_tensor])

pred_str = 'Predict: '
for i, logit in enumerate(logits[0]):
    if token_ids[i] == tokenizer._token_mask_id:
        pred_str += tokenizer.id_to_token(torch.argmax(logit, dim=-1).item())
    else:
        pred_str += tokenizer.id_to_token(token_ids[i])
print(pred_str)
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_simbert.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# SimBERT/RoFormer-Sim: test similar-question generation and sentence similarity
# Official project: https://github.com/ZhuiyiTechnology/simbert
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import torch
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab

# Basic settings
maxlen = 32
choice = 'simbert_v2'  # simbert | simbert_v2
if choice == 'simbert':
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base"
    args_model = 'bert'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base"
    args_model = 'roformer'

# Load the simbert or roformer_v2 weights
root_model_path = args_model_path
dict_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


# Build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(
            config_path=config_path,
            checkpoint_path=checkpoint_path,
            with_pool='linear',
            model=args_model,
            application='unilm',
            keep_tokens=keep_tokens
        )
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pooler, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb


model = Model(pool_method='cls').to(device)


class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder"""
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]


synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)


def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts'''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z


def gen_synonyms(text, n=100, k=20):
    """Purpose: generate n sentences similar to the input, then return the k most similar.
    Method: generate with seq2seq, then score and rank with the encoder.
    Example:
    >>> gen_synonyms(u'微信和支付宝哪个好?')
    [
        u'微信和支付宝,哪个好?',
        u'微信和支付宝哪个好',
        u'支付宝和微信哪个好',
        u'支付宝和微信哪个好啊',
        u'微信和支付宝那个好用?',
        u'微信和支付宝哪个好用',
        u'支付宝和微信那个更好',
        u'支付宝和微信哪个好用',
        u'微信和支付宝用起来哪个好?',
        u'微信和支付宝选哪个好',
    ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
    r = [text] + r
    Z = cal_sen_emb(r)
    Z /= (Z**2).sum(dim=1, keepdims=True)**0.5  # L2-normalize so the dot product is cosine similarity
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]


if __name__ == '__main__':
    choice = 'generate'  # generate | similarity
    if choice == 'generate':
        print(gen_synonyms('我想去北京玩玩可以吗', 10, 10))
    elif choice == 'similarity':
        target_text = '我想去首都北京玩玩'
        text_list = ['我想去北京玩', '北京有啥好玩的吗?我想去看看', '好渴望去北京游玩啊']
        Z = cal_sen_emb([target_text] + text_list)
        Z /= (Z**2).sum(dim=1, keepdims=True)**0.5
        similarity = torch.matmul(Z[1:], Z[0])
        for i, line in enumerate(text_list):
            print(f'cos_sim: {similarity[i].item():.4f}, tgt_text: "{target_text}", cal_text: "{line}"')
    else:
        model.load_weights('./best_model.pt')
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_t5_pegasus.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Run T5 PEGASUS; it uses a BERT-style tokenizer
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import AutoRegressiveDecoder
import jieba
jieba.initialize()

# bert config
# pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small/'
pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/'
config_path = pretrain_model + 'config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
dict_path = pretrain_model + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the tokenizer (with jieba pre-tokenization)
tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.cut(s, HMM=False))
model = build_transformer_model(config_path, checkpoint_path, model='mt5.1.1', segment_vocab_size=0).to(device)


class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder"""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        # inputs contains [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=256)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])


autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=32, device=device)
# end_id can be set to tokenizer._token_end_id, which gives shorter results

if __name__ == '__main__':
    print(autotitle.generate('今天天气不错啊'))
    # small version output: 我是个女的,我想知道我是怎么想的
    # base version output: 请问明天的天气怎么样啊?
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_transformer_xl.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Run the transformer_xl model; it is not widely used and no Chinese pretrained weights were found
# last_hidden_state was checked by stepping into the transformers package with a debugger; it matches this framework
# The English pretrained model from transformers is used to verify correctness
# Conversion script: convert_script/convert_transformer_xl.py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

pretrained_model = "F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103"

# ---------------------- transformers package ----------------------
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForCausalLM.from_pretrained(pretrained_model)
model.eval()

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    # Inspect with a debugger here
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.losses
print('transformers loss: ', loss)

# ---------------------- bert4torch config ----------------------
from bert4torch.models import build_transformer_model

config_path = f'{pretrained_model}/bert4torch_config.json'
checkpoint_path = f'{pretrained_model}/bert4torch_pytorch_model.bin'
model = build_transformer_model(config_path, checkpoint_path=checkpoint_path, model='transformer_xl')
print('bert4torch last_hidden_state: ', model.predict([inputs['input_ids']]))
# tensor([[[ 0.1027,  0.0604, -0.2585,  ...,  0.3137, -0.2679,  0.1036],
#          [ 0.3482, -0.0458, -0.4582,  ...,  0.0242, -0.0721,  0.2311],
#          [ 0.3426, -0.1353, -0.4145,  ...,  0.1123,  0.1374,  0.1313],
#          [ 0.0038, -0.0978, -0.5570,  ...,  0.0487, -0.1891, -0.0608],
#          [-0.2155, -0.1388, -0.5549,  ..., -0.1458,  0.0774,  0.0419],
#          [ 0.0967, -0.1781, -0.4328,  ..., -0.1831, -0.0808,  0.0890]]])
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_uer_t5.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Run the pretrained t5-chinese model directly for prediction; it uses a BERT-style tokenizer
# This t5 uses the t5.1.0 architecture
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import AutoRegressiveDecoder

# bert config
config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/vocab.txt'
# config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/bert4torch_config.json'
# checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/pytorch_model.bin'
# dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the vocabulary and build the tokenizer
token_dict = load_vocab(
    dict_path=dict_path,
    simplified=False,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

model = build_transformer_model(config_path, checkpoint_path, model='t5.1.0', segment_vocab_size=0).to(device)


class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder"""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = inputs[0]
        return model.predict([[token_ids], [output_ids]])[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text, maxlen=256)
        output_ids = self.beam_search([token_ids], topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())


autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=1, maxlen=32, device=device)
# end_id can instead be set to tokenizer._token_end_id, which gives shorter results

if __name__ == '__main__':
    print(autotitle.generate('中国的首都是extra0京'))
```
bert/bert4torch_cmcc/examples/basic/basic_language_model_xlnet.py (new file, 0 → 100644)

```python
from transformers import XLNetTokenizer, XLNetModel
import torch

pretrained_model = "F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base"
tokenizer = XLNetTokenizer.from_pretrained(pretrained_model)
model = XLNetModel.from_pretrained(pretrained_model)

inputs = tokenizer(["你好啊,我叫张三", "天气不错啊"], padding=True, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print('--------transformers last_hidden_state--------\n', last_hidden_states)

# ---------------------- bert4torch config ----------------------
from bert4torch.models import build_transformer_model

config_path = f'{pretrained_model}/bert4torch_config.json'
checkpoint_path = f'{pretrained_model}/pytorch_model.bin'
model = build_transformer_model(
    config_path,
    checkpoint_path=checkpoint_path,
    model='xlnet',
    # with_lm=True
    token_pad_ids=tokenizer.pad_token_id,
)
print('--------bert4torch last_hidden_state--------\n', model.predict([inputs['input_ids'], inputs['token_type_ids']]))
```
bert/bert4torch_cmcc/examples/basic/basic_make_uncased_model_cased.py (new file, 0 → 100644)

```python
#! -*- coding: utf-8 -*-
# Give a case-insensitive model the ability to distinguish case by simply editing the vocabulary
# Basic idea: add uppercased English tokens to the vocabulary and adjust the model's Embedding layer
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
import torch

root_model_path = "F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/bert_config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

token_dict = load_vocab(vocab_path)
new_token_dict = token_dict.copy()
compound_tokens = []

for t, i in sorted(token_dict.items(), key=lambda s: s[1]):
    # Two cases are considered: 1. first letter capitalized; 2. entire word uppercased.
    # Under Python 2 this adds 5594 tokens; under Python 3, 5596 tokens.
    tokens = []
    if t.isalpha():
        tokens.extend([t[:1].upper() + t[1:], t.upper()])
    elif t[:2] == '##' and t[2:].isalpha():
        tokens.append(t.upper())
    for token in tokens:
        if token not in new_token_dict:
            compound_tokens.append([i])
            new_token_dict[token] = len(new_token_dict)

tokenizer = Tokenizer(new_token_dict, do_lower_case=False)
model = build_transformer_model(
    config_path,
    checkpoint_path,
    compound_tokens=compound_tokens,  # new tokens, initialized from the average of the old tokens
)

text = u'Welcome to BEIJING.'
tokens = tokenizer.tokenize(text)
print(tokens)
"""
Output: ['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]']
"""

token_ids, segment_ids = tokenizer.encode(text)
token_ids, segment_ids = torch.tensor([token_ids]), torch.tensor([segment_ids])
model.eval()
with torch.no_grad():
    print(model([token_ids, segment_ids])[0])
"""
Output:
[[[-1.4999904e-01  1.9651388e-01 -1.7924258e-01 ...  7.8269649e-01
    2.2241375e-01  1.1325148e-01]
  [-4.5268752e-02  5.5090344e-01  7.4699545e-01 ... -4.7773960e-01
   -1.7562288e-01  4.1265407e-01]
  [ 7.0158571e-02  1.7816302e-01  3.6949167e-01 ...  9.6258509e-01
   -8.4678203e-01  6.3776302e-01]
  ...
  [ 9.3637377e-01  3.0232478e-02  8.1411439e-01 ...  7.9186147e-01
    7.5704646e-01 -8.3475001e-04]
  [ 2.3699696e-01  2.9953337e-01  8.1962071e-02 ... -1.3776925e-01
    3.8681498e-01  3.2553676e-01]
  [ 1.9728680e-01  7.7782705e-02  5.2951699e-01 ...  8.9622810e-02
   -2.3932748e-02  6.9600858e-02]]]
"""
```
bert/bert4torch_cmcc/examples/basic/basic_test_parallel_apply.py (new file, 0 → 100644)

```python
# Multiprocess/multithread parallel_apply test
from tqdm import tqdm
from bert4torch.tokenizers import Tokenizer
import torch
import numpy as np
from bert4torch.snippets import parallel_apply
import time

dict_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
categories = {'LOC': 2, 'PER': 3, 'ORG': 4}

# Relative-distance buckets
dis2idx = np.zeros((1000), dtype='int64')
dis2idx[1] = 1
dis2idx[2:] = 2
dis2idx[4:] = 3
dis2idx[8:] = 4
dis2idx[16:] = 5
dis2idx[32:] = 6
dis2idx[64:] = 7
dis2idx[128:] = 8
dis2idx[256:] = 9


# Small helper functions
def convert_index_to_text(index, type):
    text = "-".join([str(i) for i in index])
    text = text + "-#-{}".format(type)
    return text


def convert_text_to_index(text):
    index, type = text.split("-#-")
    index = [int(x) for x in index.split("-")]
    return index, int(type)


tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
maxlen = 256


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in tqdm(f.split('\n\n'), desc='Load data'):
            if not l:
                continue
            sentence, d = [], []
            for i, c in enumerate(l.split('\n')):
                char, flag = c.split(' ')
                sentence += char
                if flag[0] == 'B':
                    d.append([i, i, flag[2:]])
                elif flag[0] == 'I':
                    d[-1][1] = i
            if len(sentence) > maxlen - 2:
                continue
            D.append((sentence, d))
    return D


def func(inputs):
    sentence, d = inputs
    tokens = [tokenizer.tokenize(word)[1:-1] for word in sentence[:maxlen - 2]]
    pieces = [piece for pieces in tokens for piece in pieces]
    tokens_ids = [tokenizer._token_start_id] + tokenizer.tokens_to_ids(pieces) + [tokenizer._token_end_id]
    assert len(tokens_ids) <= maxlen
    length = len(tokens)

    # piece-to-word mapping; for Chinese the two coincide, apart from [CLS] and [SEP]
    _pieces2word = np.zeros((length, len(tokens_ids)), dtype=bool)
    e_start = 0
    for i, pieces in enumerate(tokens):
        if len(pieces) == 0:
            continue
        pieces = list(range(e_start, e_start + len(pieces)))
        _pieces2word[i, pieces[0] + 1:pieces[-1] + 2] = 1
        e_start += len(pieces)

    # Relative distances
    _dist_inputs = np.zeros((length, length), dtype=int)
    for k in range(length):
        _dist_inputs[k, :] += k
        _dist_inputs[:, k] -= k
    for i in range(length):
        for j in range(length):
            if _dist_inputs[i, j] < 0:
                _dist_inputs[i, j] = dis2idx[-_dist_inputs[i, j]] + 9
            else:
                _dist_inputs[i, j] = dis2idx[_dist_inputs[i, j]]
    _dist_inputs[_dist_inputs == 0] = 19

    # Gold labels
    _grid_labels = np.zeros((length, length), dtype=int)
    _grid_mask2d = np.ones((length, length), dtype=bool)
    for entity in d:
        e_start, e_end, e_type = entity[0], entity[1] + 1, entity[-1]
        if e_end >= maxlen - 2:
            continue
        index = list(range(e_start, e_end))
        for i in range(len(index)):
            if i + 1 >= len(index):
                break
            _grid_labels[index[i], index[i + 1]] = 1
        _grid_labels[index[-1], index[0]] = categories[e_type]

    _entity_text = set([convert_index_to_text(list(range(e[0], e[1] + 1)), categories[e[-1]]) for e in d])
    return tokens_ids, _pieces2word, _dist_inputs, _grid_labels, _grid_mask2d, _entity_text


corpus = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')
start = time.time()
train_samples = parallel_apply(
    func=func,
    iterable=corpus,
    workers=8,
    max_queue_size=2000,
    dummy=False,  # on Windows, set to True
    callback=None,
    unordered=False
)
print(time.time() - start)
```
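As a quick sanity check of the call pattern, here is a toy run (a sketch assuming the same `parallel_apply` signature as the call above; `dummy=True` keeps the work in threads, so the lambda needs no pickling):

```python
from bert4torch.snippets import parallel_apply

squares = parallel_apply(
    func=lambda x: x * x,   # any single-argument function
    iterable=range(10),
    workers=2,
    max_queue_size=100,
    dummy=True,             # threads instead of processes
    callback=None,
    unordered=False,        # keep results in input order
)
print(squares)              # expected: [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
```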
bert/bert4torch_cmcc/examples/basic/basic_test_tokenizer.py (new file, 0 → 100644)

```python
# Test whether this tokenizer matches the one shipped with transformers; tests show they agree
from transformers import BertTokenizer, XLNetTokenizer, XLNetTokenizerFast
from bert4torch.tokenizers import Tokenizer, SpTokenizer
from tqdm import tqdm

choice = 1
if choice:
    print('Test BertTokenizer')
    tokenizer_transformers = BertTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12")
    tokenizer_bert4torch = Tokenizer('F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True, do_tokenize_unk=True)
else:
    print('Test SpTokenizer')
    tokenizer_transformers = XLNetTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base")
    # tokenizer_transformers = XLNetTokenizerFast.from_pretrained("F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base")  # the fast version differs slightly
    tokenizer_bert4torch = tokenizer = SpTokenizer('F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base/spiece.model', token_start=None, token_end=None)

with open('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data', 'r', encoding='utf-8') as f:
    for l in tqdm(f):
        l = l.split('\t')[0].strip()
        tokens1 = tokenizer_transformers.tokenize(l)
        tokens2 = tokenizer_bert4torch.tokenize(l)
        tokens2 = tokens2[1:-1] if choice == 1 else tokens2
        if tokens1 != tokens2:
            print(''.join(tokens1))
            print(''.join(tokens2))
            print('------------------------------')
```
bert/bert4torch_cmcc/examples/convert_script/PLM_config.md (new file, 0 → 100644)

# Pretrained weight configs

Records the configs that need to be supplied separately for bert4torch; configs for some weights can also be found in the conversion scripts.

----
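As a usage sketch (an assumption about the workflow, not part of this file): write one of the snippets below to `bert4torch_config.json` next to the converted checkpoint, then point `build_transformer_model` at it, as the scripts in `examples/basic` do. The GAU-alpha config is used here because it is the shortest; `pytorch_model.bin` stands in for your own converted checkpoint path.

```python
import json
from bert4torch.models import build_transformer_model

# The GAU-alpha config listed below
config = {
    "hidden_act": "swish", "hidden_size": 768, "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1, "num_attention_heads": 1,
    "attention_key_size": 128, "intermediate_size": 1536,
    "num_hidden_layers": 24, "type_vocab_size": 2, "vocab_size": 12000,
}
with open('bert4torch_config.json', 'w') as f:
    json.dump(config, f, indent=2)

model = build_transformer_model('bert4torch_config.json', 'pytorch_model.bin', model='gau_alpha')
```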
- xlnet/[hit_torch_base]--chinese-xlnet-base
```json
{
  "architectures": ["XLNetLMHeadModel"],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "intermediate_size": 3072,
  "hidden_size": 768,
  "hidden_dropout_prob": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "hidden_act": "relu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_hidden_dropout_prob": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "untie_r": true,
  "vocab_size": 32000
}
```
- gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768
```json
{
  "hidden_act": "swish",
  "hidden_size": 768,
  "hidden_dropout_prob": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "num_attention_heads": 1,
  "attention_key_size": 128,
  "intermediate_size": 1536,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 12000
}
```
- gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base
```json
{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 513,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "vocab_size": 13088,
  "type_vocab_size": 3,
  "shared_segment_embeddings": true
}
```
- gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b
```json
{
  "vocab_size": 30000,
  "hidden_size": 2560,
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "initializer_range": 0.014142135623731,
  "intermediate_size": 10240,
  "max_position_embeddings": 1024,
  "num_attention_heads": 32,
  "num_hidden_layers": 32
}
```
- gpt2/[gpt2-ml_torch_15g]
```json
{
  "vocab_size": 21130,
  "hidden_size": 1536,
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "initializer_range": 0.014142135623731,
  "intermediate_size": 6144,
  "max_position_embeddings": 1024,
  "num_attention_heads": 24,
  "num_hidden_layers": 48
}
```
- t5/[google_mt5_torch_base]
```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 250112,
  "relative_attention_num_buckets": 32,
  "attention_scale": false,
  "is_dropout": true
}
```
- t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall
```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 21228,
  "relative_attention_num_buckets": 32,
  "attention_scale": false,
  "is_dropout": true
}
```
- t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall
```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "max_position_embeddings": 512,
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "type_vocab_size": 2,
  "vocab_size": 21228,
  "relative_attention_num_buckets": 32,
  "attention_scale": false,
  "is_dropout": true
}
```
- t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small
```json
{
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "num_attention_heads": 6,
  "attention_head_size": 64,
  "num_hidden_layers": 8,
  "vocab_size": 50000,
  "relative_attention_num_buckets": 32,
  "attention_scale": false,
  "is_dropout": true
}
```
- t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base
```json
{
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "num_attention_heads": 12,
  "attention_head_size": 64,
  "num_hidden_layers": 12,
  "vocab_size": 50000,
  "relative_attention_num_buckets": 32,
  "attention_scale": false,
  "is_dropout": true
}
```
- bart/[FudanNLP_torch_base]
```json
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "type_vocab_size": 2,
  "vocab_size": 21128
}
```
bert/bert4torch_cmcc/examples/convert_script/convert_GAU_alpha.py (new file, 0 → 100644)

```python
# TensorFlow weights: https://github.com/ZhuiyiTechnology/GAU-alpha
# The weights are mapped directly onto the GAU_alpha structure, so no mapping dict is needed
import torch
import tensorflow as tf

tf_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-tf]--chinese_GAU-alpha-char_L-24_H-768/bert_model.ckpt'
torch_state_dict = {}

ts = tf.train.load_variable(tf_path, 'bert/embeddings/word_embeddings')
torch_state_dict['embeddings.word_embeddings.weight'] = torch.from_numpy(ts)
torch_state_dict['mlmDecoder.weight'] = torch.from_numpy(ts)

ts = tf.train.load_variable(tf_path, 'bert/embeddings/token_type_embeddings')
torch_state_dict['embeddings.segment_embeddings.weight'] = torch.from_numpy(ts)

for i in range(24):
    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/i_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.i_dense.weight'] = torch.from_numpy(ts.T)

    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/o_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.o_dense.weight'] = torch.from_numpy(ts.T)

    ts1 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/q_scaleoffset/gamma')
    ts2 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/k_scaleoffset/gamma')
    ts = torch.stack([torch.from_numpy(ts1), torch.from_numpy(ts2)], dim=0)
    torch_state_dict[f'encoderLayer.{i}.gau.offsetscale.gamma'] = ts

torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin')

# config file
'''
{
    "hidden_act": "swish",
    "hidden_size": 768,
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "num_attention_heads": 1,
    "attention_key_size": 128,
    "intermediate_size": 1536,
    "num_hidden_layers": 24,
    "type_vocab_size": 2,
    "vocab_size": 12000
}
'''
```