wangsen / bert · Commits · 19a23d09

Commit 19a23d09, authored Jun 19, 2024 by wangsen
Initial commit
Pipeline #1247 failed with stages in 0 seconds
Showing 20 changed files with 3783 additions and 0 deletions (+3783, -0)
examples/convert_script/convert_gpt2__cmp_lm_2.6b.py  +107 -0
examples/convert_script/convert_gpt2__gpt2-ml.py  +109 -0
examples/convert_script/convert_gpt__CDial-GPT-LCCC.py  +106 -0
examples/convert_script/convert_nezha_gpt_dialog.py  +75 -0
examples/convert_script/convert_roberta_chess.py  +80 -0
examples/convert_script/convert_t5_pegasus.py  +107 -0
examples/convert_script/convert_transformer_xl.py  +104 -0
examples/others/task_conditional_language_model.py  +177 -0
examples/others/task_iflytek_bert_of_theseus.py  +212 -0
examples/others/task_language_model.py  +175 -0
examples/others/task_language_model_chinese_chess.py  +214 -0
examples/others/task_nl2sql_baseline.py  +380 -0
examples/pretrain/roberta_pretrain/pretrain_roberta_mlm.py  +151 -0
examples/pretrain/roberta_pretrain/pretrain_roberta_mlm_data_gen.py  +223 -0
examples/pretrain/simbert_v2_pretrain/simbert_v2_stage1.py  +264 -0
examples/pretrain/simbert_v2_pretrain/simbert_v2_stage2.py  +287 -0
examples/pretrain/simbert_v2_pretrain/simbert_v2_supervised.py  +164 -0
examples/relation_extraction/task_relation_extraction_CasRel.py  +318 -0
examples/relation_extraction/task_relation_extraction_gplinker.py  +248 -0
examples/relation_extraction/task_relation_extraction_tplinker.py  +282 -0
examples/convert_script/convert_gpt2__cmp_lm_2.6b.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
# Convert Tsinghua's open-source Chinese GPT2 model (2.6B parameters).
# Project link (tf version): https://github.com/TsinghuaAI/CPM-Generate
# PyTorch weights download: https://huggingface.co/TsinghuaAI/CPM-Generate; this script converts them into weights usable by bert4torch.
import torch

ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 32


def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt2'

    w = torch_weights['transformer.wte.weight']
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
    w = torch_weights['transformer.wpe.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense projection
        w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['transformer.h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['transformer.h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    # layernorm_final
    w = torch_weights['transformer.ln_f.weight']
    new_weights[f'{prefix}.LayerNormFinal.weight'] = w
    b = torch_weights['transformer.ln_f.bias']
    new_weights[f'{prefix}.LayerNormFinal.bias'] = b

    torch.save(new_weights, output_ckpt_file)


if __name__ == '__main__':
    convert()

# config file
'''
{
"vocab_size": 30000,
"hidden_size": 2560,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 10240,
"max_position_embeddings": 1024,
"num_attention_heads": 32,
"num_hidden_layers": 32
}
'''
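As a quick check that the converted checkpoint matches the naming scheme above, a loading sketch like the following can be used. This is not part of the commit: the model='gpt2' argument, the bert4torch_config.json filename, and the key inspection are assumptions based on the 'gpt2' prefix used in the script and on how build_transformer_model is called in the task scripts later in this commit.

# Minimal sanity-check sketch (assumption: bert4torch's build_transformer_model accepts model='gpt2',
# and a config.json with the contents quoted above sits next to the converted checkpoint).
import torch
from bert4torch.models import build_transformer_model

ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b'  # same directory as in the script above
state = torch.load(f'{ckpt_dir}/bert4torch_pytorch_model.bin')
print(len(state), sorted(state)[:3])  # number of converted tensors and a few example keys
model = build_transformer_model(
    config_path=f'{ckpt_dir}/bert4torch_config.json',          # hypothetical filename for the config quoted above
    checkpoint_path=f'{ckpt_dir}/bert4torch_pytorch_model.bin',
    model='gpt2',                                               # assumption: GPT2 model identifier in bert4torch
)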
examples/convert_script/convert_gpt2__gpt2-ml.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
# gpt2-ml
# Project link (tf version): https://github.com/imcaspar/gpt2-ml
# PyTorch weight conversion and download: https://github.com/ghosthamlet/gpt2-ml-torch
# Finally converted by this script into weights usable by bert4torch.
import torch

ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 48


def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt2_ml'

    w = torch_weights['wte.weight']
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
    w = torch_weights['wpe.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    # embedding layernorm
    w = torch_weights['emb_norm.weight']
    new_weights[f'{prefix}.embeddings.LayerNorm.weight'] = w
    b = torch_weights['emb_norm.bias']
    new_weights[f'{prefix}.embeddings.LayerNorm.bias'] = b

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense projection
        w = torch_weights['h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    torch.save(new_weights, output_ckpt_file)


if __name__ == '__main__':
    convert()

# config file
'''
{
"vocab_size": 21130,
"hidden_size": 1536,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 6144,
"max_position_embeddings": 1024,
"num_attention_heads": 24,
"num_hidden_layers": 48
}
'''
examples/convert_script/convert_gpt__CDial-GPT-LCCC.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
# Convert the CDial-GPT PyTorch weights into weights usable by bert4torch; both base and large can be converted.
# Project link (torch version): https://github.com/thu-coai/CDial-GPT
import torch

ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_pytorch_model.bin'
num_hidden_layers = 12


def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt'

    # In CDial-GPT, [CLS] is 0 and [PAD] is 1, which is unusual, so swap them
    w = torch_weights['transformer.tokens_embed.weight']
    w = torch.cat([w[1:2], w[:1], w[2:]], axis=0)
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w
    w = torch_weights['transformer.positions_embed.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense projection
        w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['transformer.h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['transformer.h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    torch.save(new_weights, output_ckpt_file)


if __name__ == '__main__':
    convert()

# config file
'''
{
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 513,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"vocab_size": 13088,
"type_vocab_size": 3,
"shared_segment_embeddings": true
}
'''
examples/convert_script/convert_nezha_gpt_dialog.py (new file, mode 100644)

# NEZHA model for the chit-chat task; these are weights already finetuned by Su Jianlin, note this is not a pretrained model.
# Source project: https://github.com/bojone/nezha_gpt_dialog
import torch
import tensorflow as tf

tf_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/model.ckpt'
torch_state_dict = {}
prefix = 'bert'

mapping = {
    'bert/embeddings/word_embeddings': f'{prefix}.embeddings.word_embeddings.weight',
    'bert/embeddings/token_type_embeddings': f'{prefix}.embeddings.token_type_embeddings.weight',
    'bert/embeddings/LayerNorm/beta': f'{prefix}.embeddings.LayerNorm.bias',
    'bert/embeddings/LayerNorm/gamma': f'{prefix}.embeddings.LayerNorm.weight',
    'cls/predictions/transform/dense/kernel': 'cls.predictions.transform.dense.weight##',
    'cls/predictions/transform/dense/bias': 'cls.predictions.transform.dense.bias',
    'cls/predictions/transform/LayerNorm/beta': 'cls.predictions.transform.LayerNorm.bias',
    'cls/predictions/transform/LayerNorm/gamma': 'cls.predictions.transform.LayerNorm.weight',
    'cls/predictions/output_bias': 'cls.predictions.bias'
}

for i in range(12):
    prefix_i = f'{prefix}.encoder.layer.%d.' % i
    mapping.update({
        f'bert/encoder/layer_{i}/attention/self/query/kernel': prefix_i + 'attention.self.query.weight##',  # transpose marker
        f'bert/encoder/layer_{i}/attention/self/query/bias': prefix_i + 'attention.self.query.bias',
        f'bert/encoder/layer_{i}/attention/self/key/kernel': prefix_i + 'attention.self.key.weight##',
        f'bert/encoder/layer_{i}/attention/self/key/bias': prefix_i + 'attention.self.key.bias',
        f'bert/encoder/layer_{i}/attention/self/value/kernel': prefix_i + 'attention.self.value.weight##',
        f'bert/encoder/layer_{i}/attention/self/value/bias': prefix_i + 'attention.self.value.bias',
        f'bert/encoder/layer_{i}/attention/output/dense/kernel': prefix_i + 'attention.output.dense.weight##',
        f'bert/encoder/layer_{i}/attention/output/dense/bias': prefix_i + 'attention.output.dense.bias',
        f'bert/encoder/layer_{i}/attention/output/LayerNorm/beta': prefix_i + 'attention.output.LayerNorm.bias',
        f'bert/encoder/layer_{i}/attention/output/LayerNorm/gamma': prefix_i + 'attention.output.LayerNorm.weight',
        f'bert/encoder/layer_{i}/intermediate/dense/kernel': prefix_i + 'intermediate.dense.weight##',
        f'bert/encoder/layer_{i}/intermediate/dense/bias': prefix_i + 'intermediate.dense.bias',
        f'bert/encoder/layer_{i}/output/dense/kernel': prefix_i + 'output.dense.weight##',
        f'bert/encoder/layer_{i}/output/dense/bias': prefix_i + 'output.dense.bias',
        f'bert/encoder/layer_{i}/output/LayerNorm/beta': prefix_i + 'output.LayerNorm.bias',
        f'bert/encoder/layer_{i}/output/LayerNorm/gamma': prefix_i + 'output.LayerNorm.weight'
    })

for key, value in mapping.items():
    ts = tf.train.load_variable(tf_path, key)
    if value.endswith('##'):
        value = value.replace('##', '')
        torch_state_dict[value] = torch.from_numpy(ts).T
    else:
        torch_state_dict[value] = torch.from_numpy(ts)

torch_state_dict['cls.predictions.decoder.weight'] = torch_state_dict[f'{prefix}.embeddings.word_embeddings.weight']
torch_state_dict['cls.predictions.decoder.bias'] = torch_state_dict['cls.predictions.bias']

torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin')

# config file
'''
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"max_relative_position": 64,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 14195,
"use_relative_position": true
}
'''
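Before using the converted NEZHA dialogue weights, it can help to confirm that the mapped key names line up with what a bert4torch NEZHA model expects. The sketch below is an assumption-level illustration: the model='nezha' identifier and the config path are hypothetical; the checkpoint path is the one written by the script above.

# Sketch: compare converted keys against the model's own state_dict (assumptions noted above).
import torch
from bert4torch.models import build_transformer_model

ckpt = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin'
converted = torch.load(ckpt)
model = build_transformer_model('path/to/nezha_config.json', checkpoint_path=None,  # hypothetical config path
                                model='nezha', with_mlm=True)                       # assumed model identifier
missing = set(model.state_dict()) - set(converted)       # keys the model expects but the conversion did not produce
unexpected = set(converted) - set(model.state_dict())    # keys produced that the model does not use
print('missing:', sorted(missing)[:5])
print('unexpected:', sorted(unexpected)[:5])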
examples/convert_script/convert_roberta_chess.py (new file, mode 100644)

# Supervise-train a Chinese chess (xiangqi) model with language model + game records.
# Introduction: https://kexue.fm/archives/7877
# This only converts the model Su Jianlin has already trained; note it is not a pretrained model.
import numpy as np
import h5py
import torch
# keras==2.3.1 is used here
from keras.engine import saving

tf_path = 'E:/Github/bert4keras/examples/best_model_chess.weights'
torch_state_dict = {}

# 1 means transpose, 0 means keep as-is
key_map = {
    'Embedding-Token/embeddings:0': ['embeddings.word_embeddings.weight', 0],
    'Embedding-Segment/embeddings:0': ['embeddings.segment_embeddings.weight', 0],
    'Embedding-Position/embeddings:0': ['embeddings.position_embeddings.weight', 0],
    'Embedding-Norm/gamma:0': ['embeddings.layerNorm.weight', 0],
    'Embedding-Norm/beta:0': ['embeddings.layerNorm.bias', 0],
    'MLM-Dense/kernel:0': ['mlmDense.weight', 1],
    'MLM-Dense/bias:0': ['mlmDense.bias', 0],
    'MLM-Norm/gamma:0': ['mlmLayerNorm.weight', 0],
    'MLM-Norm/beta:0': ['mlmLayerNorm.bias', 0],
    'MLM-Bias/bias:0': ['mlmBias', 0],
}

for i in range(12):
    key_map.update({
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.q.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.q.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.k.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.k.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.v.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.v.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.o.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.o.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm1.weight', 0],
        f'Transformer-{i}-MultiHeadSelfAttention-Norm/beta:0': [f'encoderLayer.{i}.layerNorm1.bias', 0],
        f'Transformer-{i}-FeedForward/dense_{i*6+5}/kernel:0': [f'encoderLayer.{i}.feedForward.intermediateDense.weight', 1],
        f'Transformer-{i}-FeedForward/dense_{i*6+5}/bias:0': [f'encoderLayer.{i}.feedForward.intermediateDense.bias', 0],
        f'Transformer-{i}-FeedForward/dense_{i*6+6}/kernel:0': [f'encoderLayer.{i}.feedForward.outputDense.weight', 1],
        f'Transformer-{i}-FeedForward/dense_{i*6+6}/bias:0': [f'encoderLayer.{i}.feedForward.outputDense.bias', 0],
        f'Transformer-{i}-FeedForward-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm2.weight', 0],
        f'Transformer-{i}-FeedForward-Norm/beta:0': [f'encoderLayer.{i}.layerNorm2.bias', 0],
    })

consume_keys = set()
with h5py.File(tf_path, mode='r') as f:
    if 'layer_names' not in f.attrs and 'model_weights' in f:
        f = f['model_weights']
    layer_names = saving.load_attributes_from_hdf5_group(f, 'layer_names')
    weight_value_tuples = []
    for k, name in enumerate(layer_names):
        g = f[name]
        weight_names = saving.load_attributes_from_hdf5_group(g, 'weight_names')
        weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for i, weight_name in enumerate(weight_names):
            new_key = key_map[weight_name][0]
            if key_map[weight_name][1] == 1:  # transpose
                torch_state_dict[new_key] = torch.from_numpy(weight_values[i]).T
            else:
                torch_state_dict[new_key] = torch.from_numpy(weight_values[i])
            assert new_key not in consume_keys, 'duplicate keys'
            consume_keys.add(new_key)

    if hasattr(f, 'close'):
        f.close()
    elif hasattr(f.file, 'close'):
        f.file.close()

torch_state_dict['mlmDecoder.weight'] = torch_state_dict['embeddings.word_embeddings.weight']
torch_state_dict['mlmDecoder.bias'] = torch_state_dict['mlmBias']

# for k, v in torch_state_dict.items():
#     print(k, v.shape)

torch.save(torch_state_dict, 'E:/Github/bert4torch/examples/others/best_model_chess.pt')
examples/convert_script/convert_t5_pegasus.py (new file, mode 100644)

# Convert t5_pegasus from tf to the PyTorch version adapted for bert4torch.
# Weights link: https://github.com/ZhuiyiTechnology/t5-pegasus
import torch
import tensorflow as tf
import json

# small
tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_small]--chinese_t5_pegasus_small/'
torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small/pytorch_model.bin'
# base:
# tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_base]--chinese_t5_pegasus_base/'
# torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/pytorch_model.bin'
tf_path = tf_dir + 'model.ckpt'

with open(tf_dir + 'config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
num_layers = config['num_hidden_layers']

torch_state_dict = {}

mapping = {
    'shared/embedding': 'shared.weight',
    'encoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',  # custom marker: names ending in ##T are transposed
    'encoder/rms_norm/scale': 'encoder.final_layer_norm.weight',
    'decoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',
    'decoder/rms_norm/scale': 'decoder.final_layer_norm.weight',
    'decoder/logits/kernel': 'lm_head.weight##T'
}

for i in range(num_layers):
    i1 = str(i).rjust(3, '0')
    mapping.update({
        f'encoder/block_{i1}/layer_000/SelfAttention/q': f'encoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/k': f'encoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/v': f'encoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/o': f'encoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'encoder/block_{i1}/layer_000/rms_norm/scale': f'encoder.block.{i}.layer.0.layer_norm.weight',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_0/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_1/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wo/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wo.weight##T',
        f'encoder/block_{i1}/layer_001/rms_norm/scale': f'encoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_000/SelfAttention/q': f'decoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/k': f'decoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/v': f'decoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/o': f'decoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'decoder/block_{i1}/layer_000/rms_norm/scale': f'decoder.block.{i}.layer.0.layer_norm.weight',
        f'decoder/block_{i1}/layer_001/EncDecAttention/q': f'decoder.block.{i}.layer.1.EncDecAttention.q.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/k': f'decoder.block.{i}.layer.1.EncDecAttention.k.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/v': f'decoder.block.{i}.layer.1.EncDecAttention.v.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/o': f'decoder.block.{i}.layer.1.EncDecAttention.o.weight##T',
        f'decoder/block_{i1}/layer_001/rms_norm/scale': f'decoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_0/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_1/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wo/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wo.weight##T',
        f'decoder/block_{i1}/layer_002/rms_norm/scale': f'decoder.block.{i}.layer.2.layer_norm.weight',
    })

transpose_layers = ['']
for k, v in mapping.items():
    ts = torch.from_numpy(tf.train.load_variable(tf_path, k))
    # if len(ts.shape)==2 and ts.shape[0] == ts.shape[1]:
    #     print(k, v)
    if v.endswith('##T'):
        torch_state_dict[v.rstrip('##T')] = ts.T
    else:
        torch_state_dict[v] = ts

torch.save(torch_state_dict, torch_path)

# config file
'''
# base version
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2048,
"num_attention_heads": 12,
"attention_head_size": 64,
"num_hidden_layers": 12,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
# small version
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"num_attention_heads": 6,
"attention_head_size": 64,
"num_hidden_layers": 8,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
'''
examples/convert_script/convert_transformer_xl.py (new file, mode 100644)

# Weights link: https://huggingface.co/transfo-xl-wt103
# This model is English: it is only used to debug the transformer_xl model structure in bert4torch and was not actually used for finetuning.
import torch

ckpt_file = 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/pytorch_model.bin'
torch_state_dict = {}

# 1 means transpose, 0 means keep as-is
key_map = {
    'transformer.word_emb.emb_layers.0.weight': 'embeddings.emb_layers.0.weight',
    'transformer.word_emb.emb_layers.1.weight': 'embeddings.emb_layers.1.weight',
    'transformer.word_emb.emb_layers.2.weight': 'embeddings.emb_layers.2.weight',
    'transformer.word_emb.emb_layers.3.weight': 'embeddings.emb_layers.3.weight',
    'transformer.word_emb.emb_projs.0': 'embeddings.emb_projs.0',
    'transformer.word_emb.emb_projs.1': 'embeddings.emb_projs.1',
    'transformer.word_emb.emb_projs.2': 'embeddings.emb_projs.2',
    'transformer.word_emb.emb_projs.3': 'embeddings.emb_projs.3',
}

for i in range(18):
    key_map.update({
        f'transformer.layers.{i}.dec_attn.r_r_bias': f'encoderLayer.{i}.multiHeadAttention.r_r_bias',
        f'transformer.layers.{i}.dec_attn.r_w_bias': f'encoderLayer.{i}.multiHeadAttention.r_w_bias',
        f'transformer.layers.{i}.dec_attn.o_net.weight': f'encoderLayer.{i}.multiHeadAttention.o.weight',
        f'transformer.layers.{i}.dec_attn.layer_norm.weight': f'encoderLayer.{i}.layerNorm1.weight',
        f'transformer.layers.{i}.dec_attn.layer_norm.bias': f'encoderLayer.{i}.layerNorm1.bias',
        f'transformer.layers.{i}.dec_attn.r_net.weight': f'encoderLayer.{i}.multiHeadAttention.r.weight',
        f'transformer.layers.{i}.pos_ff.CoreNet.0.weight': f'encoderLayer.{i}.feedForward.intermediateDense.weight',
        f'transformer.layers.{i}.pos_ff.CoreNet.0.bias': f'encoderLayer.{i}.feedForward.intermediateDense.bias',
        f'transformer.layers.{i}.pos_ff.CoreNet.3.weight': f'encoderLayer.{i}.feedForward.outputDense.weight',
        f'transformer.layers.{i}.pos_ff.CoreNet.3.bias': f'encoderLayer.{i}.feedForward.outputDense.bias',
        f'transformer.layers.{i}.pos_ff.layer_norm.weight': f'encoderLayer.{i}.layerNorm2.weight',
        f'transformer.layers.{i}.pos_ff.layer_norm.bias': f'encoderLayer.{i}.layerNorm2.bias',
    })

torch_weights = torch.load(ckpt_file)
model_new = {}
for key, value in key_map.items():
    model_new[value] = torch_weights[key]

for i in range(18):
    qkv_net = torch_weights[f'transformer.layers.{i}.dec_attn.qkv_net.weight']
    model_new[f'encoderLayer.{i}.multiHeadAttention.q.weight'], \
        model_new[f'encoderLayer.{i}.multiHeadAttention.k.weight'], \
        model_new[f'encoderLayer.{i}.multiHeadAttention.v.weight'] = qkv_net.chunk(3, dim=0)

torch.save(model_new, 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/bert4torch_pytorch_model.bin')

# config file
'''
{
"adaptive": true,
"architectures": [
"TransfoXLLMHeadModel"
],
"attn_type": 0,
"clamp_len": 1000,
"cutoffs": [
20000,
40000,
200000
],
"d_embed": 1024,
"d_head": 64,
"intermediate_size": 4096,
"hidden_size": 1024,
"div_val": 4,
"is_dropout": true,
"adaptive_embedding": true,
"attention_probs_dropout_prob": 0.0,
"hidden_dropout_prob": 0.1,
"hidden_act": "relu",
"eos_token_id": 0,
"ext_len": 0,
"init": "normal",
"init_range": 0.01,
"init_std": 0.02,
"layer_norm_epsilon": 1e-05,
"mem_len": 1600,
"model_type": "transfo-xl",
"num_attention_heads": 16,
"num_hidden_layers": 18,
"pre_lnorm": false,
"proj_init_std": 0.01,
"same_length": true,
"sample_softmax": -1,
"task_specific_params": {
"text-generation": {
"do_sample": true,
"max_length": 250
}
},
"tgt_len": 128,
"tie_projs": [
false,
true,
true,
true
],
"tie_weight": true,
"untie_r": true,
"vocab_size": 267735
}
'''
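Because this script splits the fused qkv_net weight into separate q/k/v tensors, a quick inspection of the written checkpoint makes it easy to confirm the split happened per layer. The sketch below only uses torch.load on the file saved above; the choice of layer 0 is arbitrary.

# Sketch: inspect the saved bert4torch checkpoint and confirm the q/k/v split.
import torch

ckpt = 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/bert4torch_pytorch_model.bin'
state = torch.load(ckpt)
for name in ('q', 'k', 'v'):
    key = f'encoderLayer.0.multiHeadAttention.{name}.weight'
    print(key, tuple(state[key].shape))  # all three should share the same shape after chunk(3, dim=0)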
examples/others/task_conditional_language_model.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
# Conditional language model task with bert.
# Generate text at random conditioned on a class; the classes in this demo are sentiment polarity (positive/negative).
# See: https://kexue.fm/archives/7124
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate, Callback, AutoRegressiveDecoder, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn

# model settings
maxlen = 128
batch_size = 16
num_classes = 2
epochs = 20

# bert settings
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and try to split it into sentences no longer than maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
                    # if len(D) >= 100:
                    #     break
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(label)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids, batch_labels], batch_token_ids

# load the dataset
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data',
                                         'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data',
                                         'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# define the model structure on top of bert
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        c = nn.Embedding(num_classes, 128)
        self.bert = build_transformer_model(config_path, checkpoint_path, with_mlm=True, application='lm',
                                            keep_tokens=keep_tokens,  # only keep tokens in keep_tokens, simplifying the original vocabulary
                                            layer_norm_cond=c,
                                            ignore_invalid_weights=True)  # ignore weights that cannot be initialized

    def forward(self, inputs):
        _, seq_output = self.bert(inputs)  # [btz, seq_len, vocab_size]
        return seq_output

model = Model().to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, input, target):
        input = input[:, :-1, :].reshape(-1, input.shape[-1])
        target = target[:, 1:].flatten()
        return super().forward(input, target)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class RandomSentiment(AutoRegressiveDecoder):
    """Randomly generate a batch of sentences given the sentiment label (0: negative, 1: positive)."""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = output_ids
        segment_ids = torch.zeros_like(token_ids, device=device)
        label = inputs[0]
        return model.predict([token_ids, segment_ids, label])[:, -1, :]

    def generate(self, label, n=1, topp=0.95):
        results = self.random_sample([[label]], n, topp=topp)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in results]

random_sentiment = RandomSentiment(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def just_show():
    print(u'正面采样:')  # positive samples
    print(random_sentiment.generate(1, 5, 0.95), '\n')
    print(u'负面采样:')  # negative samples
    print(random_sentiment.generate(0, 5, 0.95), '\n')

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.lowest = 1e10
    def on_epoch_end(self, steps, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show sample generations
        just_show()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
"""
正面采样 (positive samples):
[
u'外观时尚、漂亮、性价比高。',
u'外观漂亮,配置均衡,比较满意,性价比高,外观漂亮,性能较高。',
u'我是在大学的时候看到这本书的,所以一直在买。书中的作者是林静蕾,她用自己的口吻写出了一个孩子成长中的心路历程,让我看到了她们成长中的不同之处,以及她们成长过程中的不同境界。让我很欣赏!',
u'我想这是一本能够告诉读者什么是坏的,而不是教你怎样说话,告诉我什么是错。这里我推荐了《我要讲故事》,这本书是我很喜欢的一本书,我认为它的理由很多,但是,我相信我。如果你从中得到一些改进,或者你已经有了一个明智的决定。',
u'我们一家五口住的是标间,大床房,大床的床很舒服;而我们在携程网上订了两套大床房,这个酒店的价格还是比较合理的;但是房间的隔音效果不太理想,有点响的声音;酒店门口的地铁在施工中,不方便;但是酒店的门口的出租车不知道是哪个车的,打车不是很方便;酒店外面的停'
]
负面采样 (negative samples):
[
u'不知道是不是因为电池不太好,不是我不喜欢。',
u'看了评论才买的. 结果发现不是那么便宜, 价格也不便宜.',
u'1、外壳不容易沾手印,不容易洗洗2、屏幕有点旧, 不能下载铃声',
u'我是7月6日订购了《杜拉拉升职记》并已通过银行付款,为什么订单下了两周多至今还未到货?是收货时间太快了,可能就这么过去了吧?',
u'这本书我是在网上先看了一遍,后来我再看了一遍。感觉作者的文笔实在太烂了,特别是在写他的博客时特别别扭,写得很不专业,特别是他写股票时那个情绪调节的小男孩,简直就是自作聪明的样子,简直就是自作聪明的一种表现!'
]
"""
examples/others/task_iflytek_bert_of_theseus.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Model compression on a text-classification example.
# Method: BERT-of-Theseus
# Paper: https://arxiv.org/abs/2002.02925
# Blog: https://kexue.fm/archives/7575
import json
from bert4torch.models import build_transformer_model, BaseModel, BERT
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.tokenizers import Tokenizer
from bert4torch.layers import BertLayer
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torchinfo import summary
import copy
from torch.distributions.bernoulli import Bernoulli

num_classes = 119
maxlen = 128
batch_size = 32
replacing_rate = 0.5
steps_for_replacing = 2000

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data. Single-sample format: (text, label id)"""
        D = []
        with open(filename, encoding='utf-8') as f:
            for i, l in enumerate(f):
                l = json.loads(l)
                text, label = l['sentence'], l['label']
                D.append((text, int(label)))
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/dev.json'),
                              batch_size=batch_size, collate_fn=collate_fn)

class BERT_THESEUS(BERT):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        layer = BertLayer(self.hidden_size, self.num_attention_heads, self.dropout_rate, self.attention_probs_dropout_prob,
                          self.intermediate_size, self.hidden_act, is_dropout=False, conditional_size=self.conditional_size)
        self.encoderLayer = nn.ModuleList(nn.ModuleList([copy.deepcopy(layer) for _ in range(self.num_hidden_layers)]))
        self.scc_n_layer = 6  # distil down to 6 layers
        self.scc_layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(self.scc_n_layer)])
        self.compress_ratio = self.num_hidden_layers // self.scc_n_layer
        self.bernoulli = None

    def set_replacing_rate(self, replacing_rate):
        if not 0 < replacing_rate <= 1:
            raise Exception('Replace rate must be in the range (0, 1]!')
        self.bernoulli = Bernoulli(torch.tensor([replacing_rate]))

    def apply_main_layers(self, inputs):
        """The body of BERT is a stack of self-attention modules.
        Order: Att --> Add --> LN --> FFN --> Add --> LN
        """
        hidden_states, attention_mask, conditional_emb = inputs
        encoded_layers = [hidden_states]  # add the embedding output
        if self.training:
            inference_layers = []
            for i in range(self.scc_n_layer):
                if self.bernoulli.sample() == 1:  # REPLACE
                    inference_layers.append(self.scc_layer[i])
                else:  # KEEP the original
                    for offset in range(self.compress_ratio):
                        inference_layers.append(self.encoderLayer[i * self.compress_ratio + offset])
        else:  # inference with compressed model
            inference_layers = self.scc_layer

        # forward
        for i, layer_module in enumerate(inference_layers):
            hidden_states = layer_module(hidden_states, attention_mask, conditional_emb)
            if self.output_all_encoded_layers:
                encoded_layers.append(hidden_states)
        if not self.output_all_encoded_layers:
            encoded_layers.append(hidden_states)
        return [encoded_layers, conditional_emb]

# define the model structure on top of bert
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=BERT_THESEUS)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], num_classes)

    def forward(self, token_ids, segment_ids):
        encoded_layers = self.bert([token_ids, segment_ids])
        output = self.dense(encoded_layers[:, 0, :])  # take the first position
        return output

model = Model().to(device)
summary(model, input_data=next(iter(train_dataloader))[0])

# replacing strategies
class ConstantReplacementScheduler:
    def __init__(self, bert_encoder, replacing_rate, replacing_steps=None):
        self.bert_encoder = bert_encoder
        self.replacing_rate = replacing_rate
        self.replacing_steps = replacing_steps
        self.step_counter = 0
        self.bert_encoder.set_replacing_rate(replacing_rate)

    def step(self):
        self.step_counter += 1
        if self.replacing_steps is None or self.replacing_rate == 1.0:
            return self.replacing_rate
        else:
            if self.step_counter >= self.replacing_steps:
                self.bert_encoder.set_replacing_rate(1.0)
                self.replacing_rate = 1.0
            return self.replacing_rate

class LinearReplacementScheduler:
    def __init__(self, bert_encoder, base_replacing_rate, k):
        self.bert_encoder = bert_encoder
        self.base_replacing_rate = base_replacing_rate
        self.step_counter = 0
        self.k = k
        self.bert_encoder.set_replacing_rate(base_replacing_rate)

    def step(self):
        self.step_counter += 1
        current_replacing_rate = min(self.k * self.step_counter + self.base_replacing_rate, 1.0)
        self.bert_encoder.set_replacing_rate(current_replacing_rate)
        return current_replacing_rate

replacing_rate_scheduler = ConstantReplacementScheduler(bert_encoder=model.bert, replacing_rate=replacing_rate, replacing_steps=steps_for_replacing)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), scheduler=replacing_rate_scheduler, metrics=['accuracy'])

def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.
    def on_epoch_end(self, steps, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(u'val_acc: %.5f, best_val_acc: %.5f\n' % (val_acc, self.best_val_acc))

def predict_to_file(in_file, out_file):
    """Write predictions to a file.
    The result file can be submitted to https://www.cluebenchmarks.com for evaluation.
    """
    fw = open(out_file, 'w')
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            text = l['sentence']
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            label = model.predict([[token_ids], [segment_ids]])[0].argmax()
            l = json.dumps({'id': str(l['id']), 'label': str(label)})
            fw.write(l + '\n')
    fw.close()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=50, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
    # predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json')
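Once training has converged, inference only runs through the six scc_layer ("successor") modules, so one plausible way to ship a smaller checkpoint is to keep just those weights plus the embeddings and classifier head. This sketch is not part of the original script; it assumes the script above has been run (so model and torch are in scope), that the successor layers live under keys containing '.scc_layer.', and the output filename is hypothetical.

# Sketch: export only the 6-layer successor weights after BERT-of-Theseus training (assumptions above).
student_state = {k: v for k, v in model.state_dict().items()
                 if '.scc_layer.' in k or 'encoderLayer' not in k}  # drop the 12 original predecessor layers
torch.save(student_state, 'best_model_theseus_student.pt')  # hypothetical filename
print(f'kept {len(student_state)} of {len(model.state_dict())} tensors')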
examples/others/task_language_model.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
# Language model task with bert: novel generation.
import glob, re
from tqdm import tqdm
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, Callback, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim

maxlen = 256
batch_size = 8
epochs = 10000

# bert settings
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and try to split it into sentences no longer than maxlen."""
        novels = []
        for txt in glob.glob(filenames):
            txt = open(txt, encoding='utf-8').read()
            txt = txt.replace('\r', '').replace('\n', '')
            txt = txt.replace(u'整理制作,并提供下载', '')
            txt = re.sub(u'www.*?com', '', txt)
            txt = txt.replace(u'\u3000', ' ')
            sents = []
            for t in txt.split(' '):
                for s in re.findall(u'.*?。', t):
                    if len(s) <= maxlen - 2:
                        sents.append(s)
            novels.append(sents)

        data = []
        pbar = tqdm(desc=u'构建语料中', total=sum(len(n) for n in novels))  # building the corpus
        for novel in novels:
            s = u''
            for i in range(len(novel)):
                for j in range(len(novel) - i):
                    if len(s) + len(novel[i + j]) > maxlen - 2:
                        data.append(s)
                        s = u''
                        break
                    else:
                        s += novel[i + j]
                pbar.update(1)
                if i + j >= len(novel):
                    break
            if s:
                data.append(s)
        pbar.close()
        return data

def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    for text in batch:
        token_ids, segment_ids = tokenizer.encode(text)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_token_ids

# build the dataloader
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/pretrain/金庸小说/*.txt'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# build the model
model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='lm',
    keep_tokens=keep_tokens,  # only keep tokens in keep_tokens, simplifying the original vocabulary
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        _, mlm_scores = outputs
        mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
        target = target[:, 1:].flatten()
        return super().forward(mlm_scores, target)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

# random sampling
class StoryCompletion(AutoRegressiveDecoder):
    """Story continuation based on random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = inputs[0]
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.zeros_like(token_ids, device=device)
        _, mlm_scores = model.predict([token_ids, segment_ids])
        return mlm_scores[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids[:-1]], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]

story_completion = StoryCompletion(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def just_show():
    s1 = u'当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。'
    s2 = u'虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。'
    s3 = u'杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。'
    for s in [s1, s2, s3]:
        t = story_completion.generate(s)
        print(u'输入: %s' % s)
        print(u'结果: %s\n' % ('\n'.join(t)))

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.lowest = 1e10
    def on_epoch_end(self, steps, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show sample generations
        just_show()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=epochs, steps_per_epoch=100, callbacks=[evaluator])
else:
    model.load_weights('./best_model.weights')
"""
效果 (sample outputs):
输入: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。
结果: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。次日清晨,张无忌便和赵敏去买了一匹高头大马,自己骑了随伴。那马甚有神骏,三十六斤重的身躯之中,竟无一头白马。他心中怦怦乱跳,暗想:若能将赵敏引出迷城,我决不致再和她相会,但若和赵姑娘相遇,我一生一世决计再难相见。何况我是她的私生女儿,这般亲热,岂不是好?我如何能和她相见?今后我要教训教训她才好?我教教她,教训她,要她心里快快活活的。他心如刀割,当即回到客店,将张无忌的所在说了。
输入: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。
结果: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。那矮子见他如此功力,大吃一惊,叫道:什么人?是谁?你干什么?我师父是谁?你们是谁?是谁?你们是谁?我师父是谁?你这矮子,便是段延庆。你们不知道我师父便是,是不是?快快说来。那矮子道:我师父便是延庆太子,他的徒弟也是段延庆。他老人家在唐朝做镇南王,你们便将他改名为延庆太子,叫做延庆太子!这名头倒怪,你们大伙儿听见了,也不知道他老人家是死是活。
输入: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。
结果: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。这时见他手中所握,竟是一柄特制的短剑,心中大喜,叫道::原来是金蛇郎君的剑!原来你便是金蛇郎君的弟子,这一下可要叫我失望了。那人哈哈一笑,说道:好啊!好啊,好啊!我的金蛇剑是我的,不过我是你的。这人道:我姓杨名过,名字叫过。你是我儿子,是我女儿,是不是?你这么大的年纪,怎地自称金刀驸马?我这就给你取个名字,叫作过儿。
"""
examples/others/task_language_model_chinese_chess.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
# Supervise-train a Chinese chess (xiangqi) model with language model + game records.
# Introduction: https://kexue.fm/archives/7877
# Data: https://github.com/bojone/gpt_cchess
# Model training works under python2/python3, but the cchess module only supports python3,
# so interactive play against the model requires python3.
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_roberta_chess.py
import json
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from cchess import *

# basic settings
maxlen = 512
steps_per_epoch = 1000
epochs = 10000
batch_size = 16

# bert settings
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the full game records.
        Returns: [(record, result)], where result == 2 means red wins, 1 a draw,
        0 means black wins, and -1 means no explicit result was annotated.
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                if not l['fen']:
                    result = int(l['items'].get(u'棋局结果', -1))
                    D.append((l['iccs'], result))
        return D

# build the tokenizer
chars = [u'[PAD]'] + list(u'0123456789abcdefghi')
token_dict = dict(zip(chars, range(len(chars))))
tokenizer = Tokenizer(token_dict)
tokenizer._token_unk_id = 0
bert_token_dict = load_vocab(dict_path)
keep_tokens = [bert_token_dict[c] for c in chars]

count = 0
def get_count():
    if count < 20000:
        n = 8
    elif count < 40000:
        n = 4
    elif count < 80000:
        n = 2
    else:
        n = 1
    return n

def collate_fn(batch):
    """Data generator"""
    batch_token_ids, batch_segment_ids = [], []
    for text, _ in batch:
        token_ids, segment_ids = tokenizer.encode(' '.join(text), maxlen=maxlen // get_count() + 1)
        batch_token_ids.append([0] + token_ids[1:-1])
        batch_segment_ids.append([0] + segment_ids[1:-1])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    global count
    count += 1
    return [batch_token_ids, batch_segment_ids], batch_token_ids

# build the dataloader
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/qipu/qipu.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Since 0 in this vocabulary is not the padding position, token_pad_ids=-100 is used to avoid a wrong attention_mask
model = build_transformer_model(config_path, checkpoint_path, application='lm', with_mlm=True,
                                keep_tokens=keep_tokens, token_pad_ids=-100).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        _, mlm_scores = outputs
        mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
        target = target[:, 1:].flatten()
        return super().forward(mlm_scores, target)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class ChessPlayer(object):
    """Interactive chess-playing program"""
    def move_to_chinese(self, move):
        """Convert a single move to its Chinese description"""
        if not isinstance(move, Move):
            move = Move(self.board, move[0], move[1])
        return move.to_chinese()

    def move_to_iccs(self, move):
        """Convert a single move to its iccs representation"""
        if not isinstance(move, Move):
            move = Move(self.board, move[0], move[1])
        return move.to_iccs()

    def print_board(self):
        """Print the current board.
        For readability, red pieces are shown in red and black pieces in green.
        """
        for l in self.board.dump_board():
            for c in u'兵炮车马相仕帅':
                l = l.replace(c, u'\033[1;31;40m%s\033[0m' % c)
            for c in u'卒砲砗碼象士将':
                l = l.replace(c, u'\033[1;32;40m%s\033[0m' % c)
            print(l)

    def movable_steps(self):
        """List all candidate moves for the current position"""
        return [self.move_to_iccs(m) for m in self.board.create_moves()]

    def human_input(self):
        """Human move"""
        while True:
            try:
                iccs = input(u'请输入iccs棋着: ')  # prompt for an iccs move
                print(iccs)
                move = self.board.move_iccs(iccs)
                if move is not None:
                    return iccs, move
            except KeyboardInterrupt:
                return None
            except:
                pass

    def record(self, iccs):
        """Advance the position by one move"""
        self.history += iccs
        self.board.next_turn()
        self.print_board()
        self.current = (self.current + 1) % 2

    def new_game(self, current=0):
        """Start a new game"""
        self.board = ChessBoard()
        self.board.from_fen(FULL_INIT_FEN)
        self.print_board()
        self.history = ''
        self.current = current
        if self.current == 0:  # human moves first
            iccs, move = self.human_input()
            self.record(iccs)
        while True:
            # machine move
            moves = self.movable_steps()
            iccses = [' '.join(self.history + m) for m in moves]
            token_ids = [[0] + tokenizer.encode(ic)[0][1:-1] for ic in iccses]
            token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)
            segment_ids = torch.zeros_like(token_ids)
            preds = model.predict([token_ids, segment_ids])[-1][:, -5:-1]
            preds = nn.Softmax(dim=-1)(preds)
            preds = torch.take_along_dim(preds, token_ids[:, -4:, None], dim=2)
            preds = torch.log(preds + 1e-8)[:, :, 0].sum(dim=1)
            iccs = moves[preds.argmax()]
            move = self.board.move_iccs(iccs)
            self.record(iccs)
            if self.board.is_win():
                print(u'机器赢了')  # the machine wins
                break
            # human move
            iccs, move = self.human_input()
            self.record(iccs)
            if self.board.is_win():
                print(u'人类赢了')  # the human wins
                break

chessplayer = ChessPlayer()

class Evaluator(Callback):
    """Evaluate and save."""
    def on_epoch_end(self, global_step, epoch, logs=None):
        # save the model
        # model.save_weights('./best_model_chess.pt')
        pass

if __name__ == '__main__':
    choice = 'eval'
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, steps_per_epoch=1000, epochs=20, callbacks=[evaluator])
    else:
        model.load_weights('./best_model_chess.pt')
        chessplayer.new_game(0)  # start a new game; 0 means the human moves first, 1 means the machine moves first
examples/others/task_nl2sql_baseline.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
# A baseline for the Zhuiyi Technology 2019 NL2SQL challenge (a personal work based on BERT, not an official release).
# Competition page: https://tianchi.aliyun.com/competition/entrance/231716/introduction
# Scientific Spaces: https://kexue.fm/archives/6771
# Su Jianlin's result is around 58%; this reproduction reaches 58.39%.
# Idea: [CLS] question [SEP] [CLS] col1 [SEP] [CLS] col2 [SEP]
# The whole-sentence [CLS] predicts the conds connector: {0:"", 1:"and", 2:"or"}
# Each column's [CLS] predicts whether that column is selected and its aggregation: {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"not selected"}
''' 单条样本示例
{
"table_id": "a1b2c3d4", # 相应表格的id
"question": "世茂茂悦府新盘容积率大于1,请问它的套均面积是多少?", # 自然语言问句
"sql":{ # 真实SQL
"sel": [7], # SQL选择的列
"agg": [0], # 选择的列相应的聚合函数, '0'代表无
"cond_conn_op": 0, # 条件之间的关系
"conds": [
[1, 2, "世茂茂悦府"], # 条件列, 条件类型, 条件值,col_1 == "世茂茂悦府"
[6, 0, "1"]
]
}
}
'''
from
bert4torch.tokenizers
import
Tokenizer
from
bert4torch.models
import
build_transformer_model
,
BaseModel
from
bert4torch.snippets
import
sequence_padding
,
Callback
from
bert4torch.optimizers
import
get_linear_schedule_with_warmup
import
json
import
codecs
import
numpy
as
np
from
tqdm
import
tqdm
import
jieba
import
editdistance
import
torch
from
torch.utils.data
import
Dataset
,
DataLoader
import
torch.nn.functional
as
F
from
torch
import
nn
,
optim
import
re
batch_size
=
16
maxlen
=
160
num_agg
=
7
# agg_sql_dict = {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"不被select"}
num_op
=
5
# {0:">", 1:"<", 2:"==", 3:"!=", 4:"不被select"}
num_cond_conn_op
=
3
# conn_sql_dict = {0:"", 1:"and", 2:"or"}
learning_rate
=
2.5e-5
epochs
=
15
device
=
'cuda'
if
torch
.
cuda
.
is_available
()
else
'cpu'
config_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
def
read_data
(
data_file
,
table_file
):
data
,
tables
=
[],
{}
with
open
(
data_file
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
for
l
in
f
:
data
.
append
(
json
.
loads
(
l
))
with
open
(
table_file
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
for
l
in
f
:
l
=
json
.
loads
(
l
)
d
=
{}
d
[
'headers'
]
=
l
[
'header'
]
d
[
'header2id'
]
=
{
j
:
i
for
i
,
j
in
enumerate
(
d
[
'headers'
])}
d
[
'content'
]
=
{}
d
[
'all_values'
]
=
set
()
rows
=
np
.
array
(
l
[
'rows'
])
for
i
,
h
in
enumerate
(
d
[
'headers'
]):
d
[
'content'
][
h
]
=
set
(
rows
[:,
i
])
d
[
'all_values'
].
update
(
d
[
'content'
][
h
])
d
[
'all_values'
]
=
set
([
i
for
i
in
d
[
'all_values'
]
if
hasattr
(
i
,
'__len__'
)])
tables
[
l
[
'id'
]]
=
d
return
data
,
tables
token_dict
=
{}
with
codecs
.
open
(
dict_path
,
'r'
,
'utf8'
)
as
reader
:
for
line
in
reader
:
token
=
line
.
strip
()
token_dict
[
token
]
=
len
(
token_dict
)
class
OurTokenizer
(
Tokenizer
):
def
_tokenize
(
self
,
text
):
R
=
[]
for
c
in
text
:
if
c
in
self
.
_token_dict
:
R
.
append
(
c
)
elif
self
.
_is_space
(
c
):
R
.
append
(
'[unused1]'
)
# space类用未经训练的[unused1]表示
else
:
R
.
append
(
'[UNK]'
)
# 剩余的字符是[UNK]
return
R
tokenizer
=
OurTokenizer
(
token_dict
)
def
most_similar
(
s
,
slist
):
"""从词表中找最相近的词(当无法全匹配的时候)
"""
if
len
(
slist
)
==
0
:
return
s
scores
=
[
editdistance
.
eval
(
s
,
t
)
for
t
in
slist
]
return
slist
[
np
.
argmin
(
scores
)]
def
most_similar_2
(
w
,
s
):
"""从句子s中找与w最相近的片段,
借助分词工具和ngram的方式尽量精确地确定边界。
"""
sw
=
jieba
.
lcut
(
s
)
sl
=
list
(
sw
)
sl
.
extend
([
''
.
join
(
i
)
for
i
in
zip
(
sw
,
sw
[
1
:])])
sl
.
extend
([
''
.
join
(
i
)
for
i
in
zip
(
sw
,
sw
[
1
:],
sw
[
2
:])])
return
most_similar
(
w
,
sl
)
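# Editor's note: illustrative sketch (not in the original script) of what most_similar_2 does:
# its candidate list is the jieba words plus their 2-gram and 3-gram concatenations, scored by
# edit distance against w. It relies on the jieba and editdistance packages imported above, and
# should return a fragment of the question close to the condition value.
def _demo_most_similar_2():
    question = u'世茂茂悦府新盘容积率大于1,请问它的套均面积是多少?'
    return most_similar_2(u'世茂茂悦府', question)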
class MyDataset(Dataset):
    def __init__(self, data, tables):
        self.data = data
        self.tables = tables

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        d = self.data[i]
        # [CLS] question [SEP] [CLS] col1 [SEP] [CLS] col2 [SEP]
        x1 = tokenizer.encode(d['question'])[0]
        xm = [0] + [1] * len(d['question']) + [0]
        h = []
        for j in self.tables[d['table_id']]['headers']:
            _x1 = tokenizer.encode(j)[0]
            h.append(len(x1))
            x1.extend(_x1)
        if len(x1) > maxlen:
            return
        hm = [1] * len(h)  # column mask
        # whether each column is selected
        sel = []
        for j in range(len(h)):
            if j in d['sql']['sel']:
                j = d['sql']['sel'].index(j)
                sel.append(d['sql']['agg'][j])
            else:
                sel.append(num_agg - 1)  # unselected columns are labelled num_agg-1
        conn = [d['sql']['cond_conn_op']]
        csel = np.zeros(len(d['question']) + 2, dtype='int32')  # 0 doubles as padding and as column 0; the padding part is masked out during training
        cop = np.zeros(len(d['question']) + 2, dtype='int32') + num_op - 1  # positions not covered by a condition are labelled num_op-1
        for j in d['sql']['conds']:
            if j[2] not in d['question']:
                j[2] = most_similar_2(j[2], d['question'])
            if j[2] not in d['question']:
                continue
            k = d['question'].index(j[2])
            csel[k + 1: k + 1 + len(j[2])] = j[0]
            cop[k + 1: k + 1 + len(j[2])] = j[1]
        # x1: bert input ids [101, 123, 121, 122, 123, 2399, 122, 118, 126, 3299, 5168, 6369, 2832, 6598, ...]
        # xm: bert input mask [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]
        # h: positions of the per-column [CLS] tokens [56, 60, 74, 89, 104, 114, 123, 132]
        # hm: column mask [1, 1, 1, 1, 1, 1, 1, 1]
        # sel: selected columns [4, 6, 6, 6, 6, 6, 6, 6], 6 means not selected, 4 means COUNT
        # conn: connector type [1], 1 means and
        # csel: condition columns [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        # cop: condition operators (also tags the values) [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
        return x1, xm, h, hm, sel, conn, csel, cop
def collate_fn(batch):
    x1, xm, h, hm, sel, conn, csel, cop = zip(*[i for i in batch if i])
    x1 = torch.tensor(sequence_padding(x1), dtype=torch.long, device=device)
    xm = torch.tensor(sequence_padding(xm, length=x1.shape[1]), dtype=torch.long, device=device)
    h = torch.tensor(sequence_padding(h), dtype=torch.long, device=device)
    hm = torch.tensor(sequence_padding(hm), dtype=torch.long, device=device)
    sel = torch.tensor(sequence_padding(sel), dtype=torch.long, device=device)
    conn = torch.tensor(sequence_padding(conn), dtype=torch.long, device=device)
    csel = torch.tensor(sequence_padding(csel, length=x1.shape[1]), dtype=torch.long, device=device)
    cop = torch.tensor(sequence_padding(cop, length=x1.shape[1]), dtype=torch.long, device=device)
    return [x1, h, hm], [sel, conn, csel, cop, xm, hm]

datadir = 'F:/Projects/data/corpus/other/ZhuiyiTechnology_NL2SQL'
train_dataloader = DataLoader(MyDataset(*read_data(f'{datadir}/train/train.json', f'{datadir}/train/train.tables.json')),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_data, valid_table = read_data(f'{datadir}/val/val.json', f'{datadir}/val/val.tables.json')
test_data, test_table = read_data(f'{datadir}/test/test.json', f'{datadir}/test/test.tables.json')
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        hidden_size = self.bert.configs['hidden_size']
        self.conn = nn.Linear(hidden_size, num_cond_conn_op)
        self.agg = nn.Linear(hidden_size, num_agg)
        self.op = nn.Linear(hidden_size, num_op)
        self.dense1 = nn.Linear(hidden_size, 256)
        self.dense2 = nn.Linear(hidden_size, 256)
        self.dense3 = nn.Linear(256, 1)

    def forward(self, x1_in, h, hm):
        x = self.bert([x1_in])
        # the sentence-level cls predicts the condition connector {0:"", 1:"and", 2:"or"}
        x4conn = x[:, 0]  # [cls position]
        pconn = self.conn(x4conn)  # [btz, num_cond_conn_op]
        # each column's cls predicts the aggregation / whether the column is selected {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"not selected"}
        x4h = torch.gather(x, dim=1, index=h.unsqueeze(-1).expand(-1, -1, 768))  # [btz, col_len, hdsz]
        psel = self.agg(x4h)  # [btz, col_len, num_agg]
        # sequence tagging for the condition values and operators
        pcop = self.op(x)  # [btz, seq_len, num_op]
        x = x.unsqueeze(2)  # [btz, seq_len, 1, hdsz]
        x4h = x4h.unsqueeze(1)  # [btz, 1, col_len, hdsz]
        pcsel_1 = self.dense1(x)  # [btz, seq_len, 1, 256]
        pcsel_2 = self.dense2(x4h)  # [btz, 1, col_len, 256]
        pcsel = pcsel_1 + pcsel_2
        pcsel = torch.tanh(pcsel)
        pcsel = self.dense3(pcsel)  # [btz, seq_len, col_len, 1]
        pcsel = pcsel[..., 0] - (1 - hm[:, None]) * 1e10  # [btz, seq_len, col_len]
        return pconn, psel, pcop, pcsel

model = Model().to(device)
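# Editor's note: illustrative sketch (not in the original script) of the broadcast that builds
# pcsel above: a [btz, seq_len, 1, 256] tensor added to a [btz, 1, col_len, 256] tensor yields
# [btz, seq_len, col_len, 256], i.e. one interaction vector per (question position, column) pair.
def _demo_pcsel_broadcast():
    import torch
    pcsel_1 = torch.zeros(2, 7, 1, 256)
    pcsel_2 = torch.zeros(2, 1, 3, 256)
    assert (pcsel_1 + pcsel_2).shape == (2, 7, 3, 256)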
class MyLoss(nn.Module):
    def forward(self, outputs, labels):
        pconn, psel, pcop, pcsel = outputs
        sel_in, conn_in, csel_in, cop_in, xm, hm = labels
        cm = torch.not_equal(cop_in, num_op - 1)
        batch_size = psel.shape[0]
        psel_loss = F.cross_entropy(psel.view(-1, num_agg), sel_in.view(-1), reduction='none').reshape(batch_size, -1)
        psel_loss = torch.sum(psel_loss * hm) / torch.sum(hm)
        pconn_loss = F.cross_entropy(pconn, conn_in.view(-1))
        pcop_loss = F.cross_entropy(pcop.view(-1, num_op), cop_in.view(-1), reduction='none').reshape(batch_size, -1)
        pcop_loss = torch.sum(pcop_loss * xm) / torch.sum(xm)
        pcsel_loss = F.cross_entropy(pcsel.view(-1, pcsel.shape[-1]), csel_in.view(-1), reduction='none').reshape(batch_size, -1)
        pcsel_loss = torch.sum(pcsel_loss * xm * cm) / torch.sum(xm * cm)
        loss = psel_loss + pconn_loss + pcop_loss + pcsel_loss
        return {'loss': loss, 'psel_loss': psel_loss, 'pconn_loss': pconn_loss, 'pcop_loss': pcop_loss, 'pcsel_loss': pcsel_loss}

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, len(train_dataloader), len(train_dataloader) * epochs)
model.compile(loss=MyLoss(), optimizer=optimizer, scheduler=scheduler)
def nl2sql(question, table):
    """Take a question and the table headers and convert them to SQL
    """
    x1 = tokenizer.encode(question)[0]
    h = []
    for i in table['headers']:
        _x1 = tokenizer.encode(i)[0]
        h.append(len(x1))
        x1.extend(_x1)
    hm = [1] * len(h)
    pconn, psel, pcop, pcsel = model.predict([
        torch.tensor([x1], dtype=torch.long, device=device),
        torch.tensor([h], dtype=torch.long, device=device),
        torch.tensor([hm], dtype=torch.long, device=device)
    ])
    pconn, psel, pcop, pcsel = pconn.cpu().numpy(), psel.cpu().numpy(), pcop.cpu().numpy(), pcsel.cpu().numpy()
    R = {'agg': [], 'sel': []}
    for i, j in enumerate(psel[0].argmax(1)):
        if j != num_agg - 1:  # class num_agg-1 means the column is not selected
            R['sel'].append(i)
            R['agg'].append(int(j))
    conds = []
    v_op = -1
    for i, j in enumerate(pcop[0, :len(question) + 1].argmax(1)):
        # conditions are predicted by combining the tagging and the classification
        if j != num_op - 1:
            if v_op != j:
                if v_op != -1:
                    v_end = v_start + len(v_str)
                    csel = pcsel[0][v_start: v_end].mean(0).argmax()
                    conds.append((csel, v_op, v_str))
                v_start = i
                v_op = j
                v_str = question[i - 1]
            else:
                v_str += question[i - 1]
        elif v_op != -1:
            v_end = v_start + len(v_str)
            csel = pcsel[0][v_start: v_end].mean(0).argmax()
            conds.append((csel, v_op, v_str))
            v_op = -1
    R['conds'] = set()
    for i, j, k in conds:
        if re.findall('[^\d\.]', k):
            j = 2  # non-numeric values can only use the equality operator
        if j == 2:
            if k not in table['all_values']:
                # an equality value must appear in the table; otherwise pick the closest one
                k = most_similar(k, list(table['all_values']))
            h = table['headers'][i]
            # then check that the value's column is right; if not, fix the column directly
            if k not in table['content'][h]:
                for r, v in table['content'].items():
                    if k in v:
                        i = table['header2id'][r]
                        break
        R['conds'].add((int(i), int(j), str(k)))
    R['conds'] = list(R['conds'])
    if len(R['conds']) <= 1:
        # with at most one condition, the connector is simply 0
        R['cond_conn_op'] = 0
    else:
        R['cond_conn_op'] = 1 + int(pconn[0, 1:].argmax())  # cannot be 0
    return R
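# Editor's note: illustrative sketch (not in the original script) of how nl2sql could be
# called once the model above has been trained. The table dict layout mirrors what read_data()
# builds; the values here are hypothetical toy data.
def _demo_nl2sql():
    toy_table = {
        'headers': [u'小区', u'容积率', u'套均面积'],
        'header2id': {u'小区': 0, u'容积率': 1, u'套均面积': 2},
        'content': {u'小区': {u'世茂茂悦府'}, u'容积率': {u'1.5'}, u'套均面积': {u'120'}},
        'all_values': {u'世茂茂悦府', u'1.5', u'120'},
    }
    # returns a dict with keys 'sel', 'agg', 'conds', 'cond_conn_op'
    return nl2sql(u'世茂茂悦府的套均面积是多少?', toy_table)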
def is_equal(R1, R2):
    """Check whether two SQL dicts match exactly
    """
    return (R1['cond_conn_op'] == R2['cond_conn_op']) & \
           (set(zip(R1['sel'], R1['agg'])) == set(zip(R2['sel'], R2['agg']))) & \
           (set([tuple(i) for i in R1['conds']]) == set([tuple(i) for i in R2['conds']]))

class Evaluate(Callback):
    def __init__(self):
        self.accs = []
        self.best = 0.
        self.passed = 0
        self.stage = 0

    def on_epoch_end(self, global_step, epoch, logs=None):
        acc = self.evaluate(valid_data, valid_table)
        self.accs.append(acc)
        if acc > self.best:
            self.best = acc
            # model.save_weights('best_model.weights')
        print('acc: %.5f, best acc: %.5f\n' % (acc, self.best))

    def evaluate(self, data, tables):
        right = 0.
        pbar = tqdm()
        F = open('evaluate_pred.json', 'w', encoding='utf-8')
        for i, d in enumerate(data):
            question = d['question']
            table = tables[d['table_id']]
            R = nl2sql(question, table)
            right += float(is_equal(R, d['sql']))
            pbar.update(1)
            pbar.set_description('< acc: %.5f >' % (right / (i + 1)))
            d['sql_pred'] = R
            try:
                s = json.dumps(d, ensure_ascii=False, indent=4)
            except:
                continue
            F.write(s + '\n')
        F.close()
        pbar.close()
        return right / len(data)

    def test(self, data, tables, outfile='result.json'):
        pbar = tqdm()
        F = open(outfile, 'w')
        for i, d in enumerate(data):
            question = d['question']
            table = tables[d['table_id']]
            R = nl2sql(question, table)
            pbar.update(1)
            s = json.dumps(R, ensure_ascii=False)
            F.write(s + '\n')  # the file is opened in text mode, so the json string is written directly
        F.close()
        pbar.close()

if __name__ == '__main__':
    evaluator = Evaluate()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('best_model.weights')
examples/pretrain/roberta_pretrain/pretrain_roberta_mlm.py
#! -*- coding: utf-8 -*-
# Pretraining script, single-GPU version for easy testing
# Switching to DDP takes only a few lines, see https://github.com/Tongjilibo/bert4torch/blob/master/examples/training_trick/task_distributed_data_parallel.py
from bert4torch.models import build_transformer_model
from bert4torch.snippets import sequence_padding, Callback
from bert4torch.optimizers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import json
import os
import shelve
import random
import time

# corpus path and model save path
model_saved_path = './bert_model.ckpt'
dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain'  # dir_training_data
task_name = 'roberta'

# other settings
maxlen = 512
batch_size = 7
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'  # set to None to train from scratch
learning_rate = 0.00176
weight_decay_rate = 0.01  # weight decay
num_warmup_steps = 3125
num_train_steps = 125000
steps_per_epoch = 10000
grad_accum_steps = 16  # greater than 1 means gradient accumulation is used
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# read the dataset and build data tensors
class MyDataset(Dataset):
    def __init__(self, file):
        super(MyDataset, self).__init__()
        self.file = file
        self.len = self._get_dataset_length()
        self.db = self._load_data()

    def __getitem__(self, index):
        return self.db[str(index)]

    def __len__(self):
        return self.len

    def _get_dataset_length(self):
        file_record_info = self.file + ".json"
        record_info = json.load(open(file_record_info, "r", encoding="utf-8"))
        return record_info["samples_num"]

    def _load_data(self):
        return shelve.open(self.file)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for item in batch:
        batch_token_ids.append(item['input_ids'])
        batch_labels.append(item['masked_lm_labels'])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids], batch_labels

# pick a random file from the corpus folder and build a dataloader from it
def get_train_dataloader():
    while True:
        # prepare dataset
        files_training_data = os.listdir(dir_training_data)
        files_training_data = [file.split(".")[0] for file in files_training_data if "train" in file]
        # avoid picking a file that is still being generated
        files_training_data = [i for i in set(files_training_data) if files_training_data.count(i) == 4]
        if files_training_data:
            file_train = random.choice(files_training_data)
            for suffix in [".bak", ".dat", ".dir", ".json"]:
                file_old = os.path.join(dir_training_data, file_train + suffix)
                file_new = os.path.join(dir_training_data, task_name + suffix)
                os.renames(file_old, file_new)
            cur_load_file = file_new.split(".")[0]
            train_dataloader = DataLoader(MyDataset(cur_load_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
            break
        else:
            print("No training data! Sleep 300s!")
            time.sleep(10)
            continue
    return train_dataloader

train_dataloader = get_train_dataloader()

model = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_mlm=True).to(device)

# weight decay
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay_rate},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
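# Editor's note: illustrative sketch (not in the original script) of the grouping above: any
# parameter whose name contains 'bias' or matches the LayerNorm weight/bias gets weight decay
# 0.0, every other parameter gets the configured decay rate. The tiny module is hypothetical.
def _demo_no_decay_grouping():
    import torch.nn as nn
    class _Tiny(nn.Module):
        def __init__(self):
            super().__init__()
            self.dense = nn.Linear(4, 4)
            self.LayerNorm = nn.LayerNorm(4)
    names = [n for n, _ in _Tiny().named_parameters()]
    nd_list = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    no_decay_names = [n for n in names if any(nd in n for nd in nd_list)]   # dense.bias, LayerNorm.weight, LayerNorm.bias
    decay_names = [n for n in names if n not in no_decay_names]            # dense.weight
    return decay_names, no_decay_names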
class MyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, output, batch_labels):
        y_preds = output[-1]
        y_preds = y_preds.reshape(-1, y_preds.shape[-1])
        return super().forward(y_preds, batch_labels.flatten())

# define the loss and optimizer to use; both can be customized
optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate, weight_decay=weight_decay_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
model.compile(loss=MyLoss(ignore_index=0), optimizer=optimizer, scheduler=scheduler)

class ModelCheckpoint(Callback):
    """Automatically save the latest model
    """
    def on_dataloader_end(self, logs=None):
        # when the dataloader is exhausted, close the db and delete the consumed training file
        model.train_dataloader.dataset.db.close()
        for suffix in [".bak", ".dat", ".dir", ".json"]:
            file_remove = os.path.join(dir_training_data, task_name + suffix)
            try:
                os.remove(file_remove)
            except:
                print(f"Failed to remove training data {file_remove}.")
        # rebuild the dataloader
        model.train_dataloader = get_train_dataloader()

    def on_epoch_end(self, global_step, epoch, logs=None):
        model.save_weights(model_saved_path)

if __name__ == '__main__':
    # save the model
    checkpoint = ModelCheckpoint()
    # train the model
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, grad_accumulation_steps=grad_accum_steps, epochs=epochs, callbacks=[checkpoint])
examples/pretrain/roberta_pretrain/pretrain_roberta_mlm_data_gen.py
#! -*- coding: utf-8 -*-
# Pretraining corpus construction; only the MLM task is implemented here, NSP and SOP are not used
# Scheme: keep generating files and sleep once the maximum number of saved files is reached;
# if training outpaces file generation, several copies of this generation script can be run.
import numpy as np
from bert4torch.tokenizers import Tokenizer
import json, glob, re
from tqdm import tqdm
import collections
import gc
import shelve
import time
import os
import random
import jieba
jieba.initialize()

class TrainingDataset(object):
    """Pretraining dataset generator
    """
    def __init__(self, tokenizer, sequence_length=512):
        """Arguments:
        tokenizer must be the tokenizer class shipped with bert4keras;
        """
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        self.token_pad_id = tokenizer._token_pad_id
        self.token_cls_id = tokenizer._token_start_id
        self.token_sep_id = tokenizer._token_end_id
        self.token_mask_id = tokenizer._token_mask_id
        self.vocab_size = tokenizer._vocab_size

    def padding(self, sequence, padding_value=None):
        """Pad a single sequence with zeros
        """
        if padding_value is None:
            padding_value = self.token_pad_id
        sequence = sequence[:self.sequence_length]
        padding_length = self.sequence_length - len(sequence)
        return sequence + [padding_value] * padding_length

    def sentence_process(self, text):
        """Processing function for a single text; returns the processed instance
        """
        raise NotImplementedError

    def paragraph_process(self, texts, starts, ends, paddings):
        """Processing function for a single paragraph (several texts)
        Notes: texts is a list of single sentences; starts are the start ids of each instance;
        ends are the end ids of each instance; paddings are the padding ids of each instance.
        Approach: keep appending sentences until the length gets as close to sequence_length as possible, then pad.
        """
        instances, instance = [], [[start] for start in starts]
        for text in texts:
            # process a single sentence
            sub_instance = self.sentence_process(text)
            sub_instance = [i[:self.sequence_length - 2] for i in sub_instance]
            new_length = len(instance[0]) + len(sub_instance[0])
            # if the length is about to overflow
            if new_length > self.sequence_length - 1:
                # append the end token and pad
                complete_instance = []
                for item, end, pad in zip(instance, ends, paddings):
                    item.append(end)
                    item = self.padding(item, pad)
                    complete_instance.append(item)
                # store the result and start a new sample
                instances.append(complete_instance)
                instance = [[start] for start in starts]
            # extend the current sample
            for item, sub_item in zip(instance, sub_instance):
                item.extend(sub_item)
        # append the end token and pad
        complete_instance = []
        for item, end, pad in zip(instance, ends, paddings):
            item.append(end)
            item = self.padding(item, pad)
            complete_instance.append(item)
        # store the final instance
        instances.append(complete_instance)
        return instances

    def serialize(self, instances, db, count):
        """Write to file
        """
        for instance in instances:
            input_ids, masked_lm_labels = instance[0], instance[1]
            assert len(input_ids) <= sequence_length
            features = collections.OrderedDict()
            features["input_ids"] = input_ids
            features["masked_lm_labels"] = masked_lm_labels
            db[str(count)] = features
            count += 1
        return count

    def process(self, corpus, record_name):
        """Process the input corpus
        """
        count = 0
        db = shelve.open(record_name)
        for texts in corpus:
            instances = self.paragraph_process(texts)
            count = self.serialize(instances, db, count)
        db.close()
        del instances
        gc.collect()
        # record the file name and the number of samples
        record_info = {"filename": record_name, "samples_num": count}
        json.dump(record_info, open(record_name + ".json", "w", encoding="utf-8"))
        print('write %s examples into %s' % (count, record_name))

class TrainingDatasetRoBERTa(TrainingDataset):
    """Pretraining dataset generator (RoBERTa mode)
    """
    def __init__(self, tokenizer, word_segment, mask_rate=0.15, sequence_length=512):
        """Arguments:
        tokenizer must be the tokenizer class shipped with bert4torch;
        word_segment is any word segmentation function.
        """
        super(TrainingDatasetRoBERTa, self).__init__(tokenizer, sequence_length)
        self.word_segment = word_segment
        self.mask_rate = mask_rate

    def token_process(self, token_id):
        """Replace with [MASK] with probability 80%, keep unchanged with probability 10%,
        and replace with a random token with probability 10%.
        """
        rand = np.random.random()
        if rand <= 0.8:
            return self.token_mask_id
        elif rand <= 0.9:
            return token_id
        else:
            return np.random.randint(0, self.vocab_size)
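    # Editor's note: illustrative sketch (not in the original script) of the 80/10/10 rule above,
    # checked empirically over many draws with stand-in token ids.
    @staticmethod
    def _demo_mask_rule(n=100000):
        mask_id, token_id = 103, 888
        draws = np.random.random(n)
        out = np.where(draws <= 0.8, mask_id, np.where(draws <= 0.9, token_id, -1))  # -1 stands for "random token"
        return (out == mask_id).mean(), (out == token_id).mean(), (out == -1).mean()  # roughly 0.8 / 0.1 / 0.1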
    def sentence_process(self, text):
        """Processing function for a single text
        Flow: segment into words, convert to ids, and build a whole-word mask sequence
        according to mask_rate to mark which tokens should be masked.
        """
        words = self.word_segment(text)
        rands = np.random.random(len(words))
        token_ids, mask_ids = [], []
        for rand, word in zip(rands, words):
            word_tokens = self.tokenizer.tokenize(text=word)[1:-1]
            word_token_ids = self.tokenizer.tokens_to_ids(word_tokens)
            if rand < self.mask_rate:
                word_mask_ids = [self.token_process(i) for i in word_token_ids]
                token_ids.extend(word_mask_ids)
                mask_ids.extend(word_token_ids)
            else:
                token_ids.extend(word_token_ids)
                word_mask_ids = [0] * len(word_tokens)
                mask_ids.extend(word_mask_ids)
        return [token_ids, mask_ids]

    def paragraph_process(self, texts):
        """Supply starts, ends and paddings to the parent method
        """
        starts = [self.token_cls_id, 0]
        ends = [self.token_sep_id, 0]
        paddings = [self.token_pad_id, 0]
        return super(TrainingDatasetRoBERTa, self).paragraph_process(texts, starts, ends, paddings)

if __name__ == '__main__':
    sequence_length = 512  # text length
    max_file_num = 40  # maximum number of files to keep
    dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'  # vocabulary file
    dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain'  # output directory
    dir_corpus = 'F:/Projects/data/corpus/pretrain'  # input corpus directory
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    def some_texts():
        '''Pick corpus files
        '''
        files_corpus = glob.glob(f'{dir_corpus}/*/*')  # adjust to your own directory structure
        file_corpus = random.choice(files_corpus)  # pick a random article
        count, texts = 0, []
        with open(file_corpus, encoding='utf-8') as f:
            for l in tqdm(f, desc=f'Load data from {file_corpus}'):
                l = l.strip()
                texts.extend(re.findall(u'.*?[\n。]+', l))
                count += 1
                if count == 10:  # process 10 articles at a time
                    yield texts
                    count, texts = 0, []
        if texts:
            yield texts

    def word_segment(text):
        return jieba.lcut(text)

    TD = TrainingDatasetRoBERTa(tokenizer, word_segment, sequence_length=sequence_length)

    while True:
        train_files = [file for file in os.listdir(dir_training_data) if ('train_' in file) and ('dat' in file)]
        # while the number of saved training files is below the limit
        if len(train_files) < max_file_num:
            record_name = f'{dir_training_data}/train_' + time.strftime('%Y%m%d%H%M%S', time.localtime())
            TD.process(corpus=some_texts(), record_name=record_name)
            time.sleep(1)  # optional; avoids generating identical file names
        else:
            time.sleep(300)
examples/pretrain/simbert_v2_pretrain/simbert_v2_stage1.py
#! -*- coding: utf-8 -*-
# SimBERT_v2 pretraining code, stage 1: trained like simbert plus [MASK] prediction
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()

# basic settings
maxlen = 64
batch_size = 12

# bert config; loads roformer weights
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corpus here is less rich than the official one; a custom corpus can be used
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the corpus, one json per line
        Example: {"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                D.append(json.loads(l))
        return D

def truncate(text):
    """Truncate a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]

def masked_encode(text):
    """Whole-word random masking
    """
    words = jieba.lcut(text)
    rands = np.random.random(len(words))
    source, target = [tokenizer._token_start_id], [0]
    for r, w in zip(rands, words):
        ids = tokenizer.encode(w)[0][1:-1]
        if r < 0.15 * 0.8:
            source.extend([tokenizer._token_mask_id] * len(ids))
            target.extend(ids)
        elif r < 0.15 * 0.9:
            source.extend(ids)
            target.extend(ids)
        elif r < 0.15:
            source.extend(np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1)
            target.extend(ids)
        else:
            source.extend(ids)
            target.extend([0] * len(ids))
    source = source[:maxlen - 1] + [tokenizer._token_end_id]
    target = target[:maxlen - 1] + [0]
    return source, target

def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    for d in batch:
        text, synonyms = d['text'], d['synonyms']
        synonyms = [text] + synonyms
        np.random.shuffle(synonyms)
        for _ in range(2):
            text, synonym = synonyms[:2]
            if np.random.random() < 0.5:
                text_ids = masked_encode(text)[0]
            else:
                text_ids = tokenizer.encode(text)[0]
            synonym_ids = tokenizer.encode(synonym)[0][1:]
            truncate_sequences(maxlen * 2, -2, text_ids, synonym_ids)
            token_ids = text_ids + synonym_ids
            segment_ids = [0] * len(text_ids) + [1] * len(synonym_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            text, synonym = synonym, text
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
                                            with_pool='linear', with_mlm=True, dropout_rate=0.2, application='unilm')
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb

model = Model(pool_method='cls').to(device)

class TotalLoss(nn.Module):
    """The loss has two parts: the seq2seq cross entropy and the similarity cross entropy.
    """
    def forward(self, outputs, target):
        seq_logit, sen_emb = outputs
        seq_label, seq_mask = target
        seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
        similarity_loss = self.compute_loss_of_similarity(sen_emb)
        return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}

    def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
        '''
        y_pred: [btz, seq_len, hdsz]
        y_true: [btz, seq_len]
        y_mask: [btz, seq_len]
        '''
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # marks the part to predict
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return F.cross_entropy(y_pred, y_true, ignore_index=0)

    def compute_loss_of_similarity(self, y_pred):
        y_true = self.get_labels_of_similarity(y_pred)  # build the labels
        y_pred = F.normalize(y_pred, p=2, dim=-1)  # normalize the sentence vectors
        similarities = torch.matmul(y_pred, y_pred.T)  # similarity matrix
        similarities = similarities - torch.eye(y_pred.shape[0], device=device) * 1e12  # exclude the diagonal
        similarities = similarities * 30  # scale
        loss = F.cross_entropy(similarities, y_true)
        return loss

    def get_labels_of_similarity(self, y_pred):
        idxs = torch.arange(0, y_pred.shape[0], device=device)
        idxs_1 = idxs[None, :]
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
        labels = idxs_1.eq(idxs_2).float()
        return labels
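# Editor's note: illustrative sketch (not in the original script) of the label construction
# above. collate_fn appends each (text, synonym) pair as two consecutive rows, so row 0 pairs
# with row 1, row 2 with row 3, and so on; idxs + 1 - idxs % 2 * 2 encodes exactly that swap.
def _demo_similarity_labels(bsz=4):
    import torch
    idxs = torch.arange(0, bsz)
    partner = idxs + 1 - idxs % 2 * 2                    # [1, 0, 3, 2] for bsz=4
    labels = (idxs[None, :] == partner[:, None]).float()  # row i has a 1 in the column of its partner
    return partner, labels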
model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])

class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]

synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z

def gen_synonyms(text, n=100, k=20):
    """Meaning: generate n sentences similar to the input, then return the k most similar ones.
    Approach: generate with seq2seq, then score and sort with the encoder similarity.
    Example:
    >>> gen_synonyms(u'微信和支付宝哪个好?')
    [
        u'微信和支付宝,哪个好?',
        u'微信和支付宝哪个好',
        u'支付宝和微信哪个好',
        u'支付宝和微信哪个好啊',
        u'微信和支付宝那个好用?',
        u'微信和支付宝哪个好用',
        u'支付宝和微信那个更好',
        u'支付宝和微信哪个好用',
        u'微信和支付宝用起来哪个好?',
        u'微信和支付宝选哪个好',
    ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
    r = [text] + r
    Z = cal_sen_emb(r)
    Z /= (Z ** 2).sum(dim=1, keepdims=True) ** 0.5
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]

def just_show(some_samples):
    """Inspect the effect on a few random samples
    """
    S = [np.random.choice(some_samples) for _ in range(3)]
    for s in S:
        try:
            print(u'原句子:%s' % s)
            print(u'同义句子:', gen_synonyms(s, 10, 10))
            print()
        except:
            pass

class Evaluator(Callback):
    """Evaluate the model
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, global_step, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show some examples
        just_show(['微信和支付宝拿个好用?', '微信和支付宝,哪个好?', '微信和支付宝哪个好', '支付宝和微信哪个好', '支付宝和微信哪个好啊',
                   '微信和支付宝那个好用?', '微信和支付宝哪个好用', '支付宝和微信那个更好', '支付宝和微信哪个好用',
                   '微信和支付宝用起来哪个好?', '微信和支付宝选哪个好'])

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
examples/pretrain/simbert_v2_pretrain/simbert_v2_stage2.py
#! -*- coding: utf-8 -*-
# SimBERT_v2 pretraining code, stage 2: distil simbert's similarities into roformer-sim
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, get_pool_emb
from bert4torch.snippets import AutoRegressiveDecoder, Callback, truncate_sequences
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()

# basic settings
maxlen = 64
batch_size = 12

# bert config; the stage-1 weights should be loaded here, but the official final weights are loaded as an example
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corpus is the same as in stage 1
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the corpus, one json per line
        Example: {"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                D.append(json.loads(l))
        return D

def truncate(text):
    """Truncate a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]

def masked_encode(text):
    """Whole-word random masking
    """
    words = jieba.lcut(text)
    rands = np.random.random(len(words))
    source, target = [tokenizer._token_start_id], [0]
    for r, w in zip(rands, words):
        ids = tokenizer.encode(w)[0][1:-1]
        if r < 0.15 * 0.8:
            source.extend([tokenizer._token_mask_id] * len(ids))
            target.extend(ids)
        elif r < 0.15 * 0.9:
            source.extend(ids)
            target.extend(ids)
        elif r < 0.15:
            source.extend(np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1)
            target.extend(ids)
        else:
            source.extend(ids)
            target.extend([0] * len(ids))
    source = source[:maxlen - 1] + [tokenizer._token_end_id]
    target = target[:maxlen - 1] + [0]
    return source, target

# ========== For distillation: start ==========
# simbert (teacher) config
sim_config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
sim_checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
sim_dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'
# build the teacher tokenizer
sim_tokenizer = Tokenizer(sim_dict_path, do_lower_case=True)
# build and load the teacher model
simbert = build_transformer_model(sim_config_path, sim_checkpoint_path, with_pool='linear', application='unilm').to(device)
# ========== For distillation: end ==========

def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    batch_sim_token_ids, batch_sim_segment_ids = [], []
    for d in batch:
        text, synonyms = d['text'], d['synonyms']
        synonyms = [text] + synonyms
        np.random.shuffle(synonyms)
        for _ in range(2):
            text, synonym = synonyms[:2]
            if np.random.random() < 0.5:
                text_ids = masked_encode(text)[0]
            else:
                text_ids = tokenizer.encode(text)[0]
            synonym_ids = tokenizer.encode(synonym)[0][1:]
            truncate_sequences(maxlen * 2, -2, text_ids, synonym_ids)
            token_ids = text_ids + synonym_ids
            segment_ids = [0] * len(text_ids) + [1] * len(synonym_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            # ==== For distillation: start ====
            token_ids, segment_ids = sim_tokenizer.encode(text, maxlen=maxlen)
            batch_sim_token_ids.append(token_ids)
            batch_sim_segment_ids.append(segment_ids)
            # ==== For distillation: end ====
            text, synonym = synonym, text
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    # ==== For distillation: start ====
    batch_sim_token_ids = torch.tensor(sequence_padding(batch_sim_token_ids), dtype=torch.long, device=device)
    batch_sim_segment_ids = torch.tensor(sequence_padding(batch_sim_segment_ids), dtype=torch.long, device=device)
    sim_vecs = simbert.predict([batch_sim_token_ids, batch_sim_segment_ids])[1]
    sim_vecs /= (sim_vecs ** 2).sum(dim=-1, keepdims=True) ** 0.5
    sims = torch.matmul(sim_vecs, sim_vecs.T)
    # ==== For distillation: end ====
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids, sims]
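# Editor's note: illustrative sketch (not in the original script) of the distillation target
# built above: the teacher's pooled vectors are L2-normalized, so sims is a matrix of cosine
# similarities that the student's sentence embeddings are later regressed against.
def _demo_teacher_sims():
    import torch
    import torch.nn.functional as F
    vecs = torch.randn(4, 768)            # stand-in for the teacher's pooled outputs
    vecs = F.normalize(vecs, dim=-1)      # unit-length rows
    sims = vecs @ vecs.T                  # entries in [-1, 1], diagonal equal to 1
    return sims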
train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
                                            with_pool='linear', with_mlm=True, dropout_rate=0.2, application='unilm')
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb

model = Model(pool_method='cls').to(device)

class TotalLoss(nn.Module):
    """The loss has two parts: the seq2seq cross entropy and the similarity loss.
    """
    def forward(self, outputs, target):
        seq_logit, sen_emb = outputs
        seq_label, seq_mask, sims = target
        seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
        similarity_loss = self.compute_loss_of_similarity(sen_emb, sims)
        return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}

    def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
        '''
        y_pred: [btz, seq_len, hdsz]
        y_true: [btz, seq_len]
        y_mask: [btz, seq_len]
        '''
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # marks the part to predict
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return F.cross_entropy(y_pred, y_true, ignore_index=0)

    def compute_loss_of_similarity(self, y_pred, y_true):
        y_pred = F.normalize(y_pred, p=2, dim=-1)  # normalize the sentence vectors
        similarities = torch.matmul(y_pred, y_pred.T)  # similarity matrix
        loss = 100 * torch.mean((similarities - y_true) ** 2)
        return loss

model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])

class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]

synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z

def gen_synonyms(text, n=100, k=20):
    """Meaning: generate n sentences similar to the input, then return the k most similar ones.
    Approach: generate with seq2seq, then score and sort with the encoder similarity.
    Example:
    >>> gen_synonyms(u'微信和支付宝哪个好?')
    [
        u'微信和支付宝,哪个好?',
        u'微信和支付宝哪个好',
        u'支付宝和微信哪个好',
        u'支付宝和微信哪个好啊',
        u'微信和支付宝那个好用?',
        u'微信和支付宝哪个好用',
        u'支付宝和微信那个更好',
        u'支付宝和微信哪个好用',
        u'微信和支付宝用起来哪个好?',
        u'微信和支付宝选哪个好',
    ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
    r = [text] + r
    Z = cal_sen_emb(r)
    Z /= (Z ** 2).sum(dim=1, keepdims=True) ** 0.5
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]

def just_show(some_samples):
    """Inspect the effect on a few random samples
    """
    S = [np.random.choice(some_samples) for _ in range(3)]
    for s in S:
        try:
            print(u'原句子:%s' % s)
            print(u'同义句子:', gen_synonyms(s, 10, 10))
            print()
        except:
            pass

class Evaluator(Callback):
    """Evaluate the model
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, global_step, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show some examples
        just_show(['微信和支付宝拿个好用?', '微信和支付宝,哪个好?', '微信和支付宝哪个好', '支付宝和微信哪个好', '支付宝和微信哪个好啊',
                   '微信和支付宝那个好用?', '微信和支付宝哪个好用', '支付宝和微信那个更好', '支付宝和微信哪个好用',
                   '微信和支付宝用起来哪个好?', '微信和支付宝选哪个好'])

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
examples/pretrain/simbert_v2_pretrain/simbert_v2_supervised.py
#! -*- coding: utf-8 -*-
# SimBERT_v2 supervised training code (the supervised stage)
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import json
import glob

# basic settings
maxlen = 64
batch_size = 12
labels = ['contradiction', 'entailment', 'neutral']

# bert config; the stage-2 weights should be loaded here, but the official final weights are loaded as an example
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def split(text):
    """Split a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen * 1.2, seps, strips)

class MyDataset(ListDataset):
    def load_data(self, file_path):
        dataset1_path, dataset2_path = file_path
        D1 = self.load_data_1(dataset1_path)
        D2 = self.load_data_2(dataset2_path)
        return D1 + D2

    @staticmethod
    def load_data_1(filenames, threshold=0.5):
        """Load labelled data
        Single-sample format: (text1, text2, label)
        """
        D = []
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    l = l.strip().split('\t')
                    if len(l) != 3:
                        continue
                    l[0], l[1] = split(l[0])[0], split(l[1])[0]
                    D.append((l[0], l[1], int(float(l[2]) > threshold)))
        return D

    @staticmethod
    def load_data_2(dir_path):
        """Load labelled data
        Single-sample format: (text1, text2, label)
        """
        D = []
        for filename in glob.glob(dir_path):
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    l = json.loads(l)
                    if l['gold_label'] not in labels:
                        continue
                    text1 = split(l['sentence1'])[0]
                    text2 = split(l['sentence2'])[0]
                    label = labels.index(l['gold_label']) + 2
                    D.append((text1, text2, label))
        return D

def truncate(text):
    """Truncate a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        for text in [text1, text2]:
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels

# load the datasets
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
dataset1_path = []
for task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']:
    for f in ['train', 'valid']:
        threshold = 2.5 if task_name == 'STS-B' else 0.5
        filename = '%s%s/%s.%s.data' % (data_path, task_name, task_name, f)
        dataset1_path.append(filename)
dataset2_path = 'F:/Projects/data/corpus/sentence_embedding/XNLI-MT-1.0/cnsd/cnsd-*/*.jsonl'
train_dataloader = DataLoader(MyDataset([dataset1_path, dataset2_path]), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
                                            with_pool='linear', dropout_rate=0.2)
        self.pool_method = pool_method
        self.dense = nn.Linear(768 * 3, 5, bias=False)

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)  # [btz*2, hdsz]
        # merge the two sentence vectors: concatenate a, b and |a-b|
        u, v = sen_emb[::2], sen_emb[1::2]
        sen_emb_concat = torch.cat([u, v, torch.abs(u - v)], dim=-1)  # [btz, hdsz*3]
        y_pred = self.dense(sen_emb_concat)  # [btz, 5]
        return y_pred

model = Model(pool_method='cls').to(device)

class MyLoss(nn.Module):
    """Loss
    """
    def __init__(self) -> None:
        super().__init__()
        self.mask = torch.tensor([0, 0, 1, 1, 1], device=device)

    def forward(self, y_pred, y_true):
        '''For samples from the binary datasets, push the last three logits to -inf;
        for three-way NLI samples, push the first two logits to -inf.
        '''
        task = (y_true < 1.5).long()
        y_pred_1 = y_pred - self.mask * 1e12
        y_pred_2 = y_pred - (1 - self.mask) * 1e12
        y_pred = task * y_pred_1 + (1 - task) * y_pred_2
        return F.cross_entropy(y_pred, y_true.flatten())
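# Editor's note: illustrative sketch (not in the original script) of the masking trick above.
# Labels 0/1 come from the binary similarity datasets and use logits 0..1; labels 2..4 come
# from the NLI data and use logits 2..4; the other logits are pushed towards -inf before the
# softmax so each sample is only scored against its own label space.
def _demo_two_task_masking():
    import torch
    mask = torch.tensor([0, 0, 1, 1, 1])
    y_pred = torch.zeros(2, 5)
    y_true = torch.tensor([[1], [3]])                     # one binary sample, one NLI sample
    task = (y_true < 1.5).long()
    masked = task * (y_pred - mask * 1e12) + (1 - task) * (y_pred - (1 - mask) * 1e12)
    return masked                                         # row 0 keeps logits 0..1, row 1 keeps logits 2..4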
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])

class Evaluator(Callback):
    """Evaluate the model
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, global_step, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
examples/relation_extraction/task_relation_extraction_CasRel.py
#! -*- coding:utf-8 -*-
# Triple extraction task based on the "half pointer, half tagging" structure
# Write-up: https://kexue.fm/archives/7161
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
import numpy as np
from bert4torch.layers import LayerNorm
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn as nn

maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load the label dictionaries
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data
        Single-sample format: {'text': text, 'spo_list': [(s, p, o)]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                D.append({'text': l['text'],
                          'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
        return D

def collate_fn(batch):
    def search(pattern, sequence):
        """Search for the subsequence pattern inside sequence.
        If found, return the first index; otherwise return -1.
        """
        n = len(pattern)
        for i in range(len(sequence)):
            if sequence[i:i + n] == pattern:
                return i
        return -1

    batch_token_ids, batch_segment_ids = [], []
    batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
    for d in batch:
        token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
        # organize the triples as {s: [(o, p)]}
        spoes = {}
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            s_idx = search(s, token_ids)
            o_idx = search(o, token_ids)
            if s_idx != -1 and o_idx != -1:
                s = (s_idx, s_idx + len(s) - 1)
                o = (o_idx, o_idx + len(o) - 1, p)
                if s not in spoes:
                    spoes[s] = []
                spoes[s].append(o)
        if spoes:
            # subject labels
            subject_labels = np.zeros((len(token_ids), 2))
            for s in spoes:
                subject_labels[s[0], 0] = 1  # subject start
                subject_labels[s[1], 1] = 1  # subject end
            # pick one subject at random (this is not an implementation mistake, it is the intended behaviour!)
            # Todo: the unselected subjects could arguably be masked out of the loss; the prob**n weighting of positives probably keeps the impact small
            start, end = np.array(list(spoes.keys())).T
            start = np.random.choice(start)
            end = np.random.choice(end[end >= start])
            subject_ids = (start, end)
            # the corresponding object labels
            object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
            for o in spoes.get(subject_ids, []):
                object_labels[o[0], o[2], 0] = 1
                object_labels[o[1], o[2], 1] = 1
            # build the batch
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_subject_labels.append(subject_labels)
            batch_subject_ids.append(subject_ids)
            batch_object_labels.append(object_labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_subject_labels = torch.tensor(sequence_padding(batch_subject_labels), dtype=torch.float, device=device)
    batch_subject_ids = torch.tensor(batch_subject_ids, dtype=torch.long, device=device)
    batch_object_labels = torch.tensor(sequence_padding(batch_object_labels), dtype=torch.float, device=device)
    batch_attention_mask = (batch_token_ids != tokenizer._token_pad_id)
    return [batch_token_ids, batch_segment_ids, batch_subject_ids], [batch_subject_labels, batch_object_labels, batch_attention_mask]

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)

# define the model structure on top of bert
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path)
        self.linear1 = nn.Linear(768, 2)
        self.condLayerNorm = LayerNorm(hidden_size=768, conditional_size=768 * 2)
        self.linear2 = nn.Linear(768, len(predicate2id) * 2)

    @staticmethod
    def extract_subject(inputs):
        """Pick the subject's vector representation out of output according to subject_ids
        """
        output, subject_ids = inputs
        start = torch.gather(output, dim=1, index=subject_ids[:, :1].unsqueeze(2).expand(-1, -1, output.shape[-1]))
        end = torch.gather(output, dim=1, index=subject_ids[:, 1:].unsqueeze(2).expand(-1, -1, output.shape[-1]))
        subject = torch.cat([start, end], 2)
        return subject[:, 0]
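    # Editor's note: illustrative sketch (not in the original script) of the gather above: for
    # each sample, the hidden vectors at the subject's start and end positions are pulled out
    # and concatenated into a single conditioning vector of size hdsz*2.
    @staticmethod
    def _demo_extract_subject():
        btz, seq_len, hdsz = 2, 6, 4
        output = torch.arange(btz * seq_len * hdsz, dtype=torch.float).reshape(btz, seq_len, hdsz)
        subject_ids = torch.tensor([[1, 3], [0, 5]])        # (start, end) per sample
        start = torch.gather(output, 1, subject_ids[:, :1].unsqueeze(2).expand(-1, -1, hdsz))
        end = torch.gather(output, 1, subject_ids[:, 1:].unsqueeze(2).expand(-1, -1, hdsz))
        subject = torch.cat([start, end], 2)[:, 0]           # [btz, hdsz*2]
        assert torch.equal(subject[0, :hdsz], output[0, 1])  # start vector of sample 0
        return subject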
def
forward
(
self
,
inputs
):
# 预测subject
seq_output
=
self
.
bert
(
inputs
[:
2
])
# [btz, seq_len, hdsz]
subject_preds
=
(
torch
.
sigmoid
(
self
.
linear1
(
seq_output
)))
**
2
# [btz, seq_len, 2]
# 传入subject,预测object
# 通过Conditional Layer Normalization将subject融入到object的预测中
subject_ids
=
inputs
[
2
]
# 理论上应该用LayerNorm前的,但是这样只能返回各个block顶层输出,这里和keras实现不一致
subject
=
self
.
extract_subject
([
seq_output
,
subject_ids
])
output
=
self
.
condLayerNorm
([
seq_output
,
subject
])
output
=
(
torch
.
sigmoid
(
self
.
linear2
(
output
)))
**
4
object_preds
=
output
.
reshape
(
*
output
.
shape
[:
2
],
len
(
predicate2id
),
2
)
return
[
subject_preds
,
object_preds
]
def
predict_subject
(
self
,
inputs
):
self
.
eval
()
with
torch
.
no_grad
():
seq_output
=
self
.
bert
(
inputs
[:
2
])
# [btz, seq_len, hdsz]
subject_preds
=
(
torch
.
sigmoid
(
self
.
linear1
(
seq_output
)))
**
2
# [btz, seq_len, 2]
return
[
seq_output
,
subject_preds
]
def
predict_object
(
self
,
inputs
):
self
.
eval
()
with
torch
.
no_grad
():
seq_output
,
subject_ids
=
inputs
subject
=
self
.
extract_subject
([
seq_output
,
subject_ids
])
output
=
self
.
condLayerNorm
([
seq_output
,
subject
])
output
=
(
torch
.
sigmoid
(
self
.
linear2
(
output
)))
**
4
object_preds
=
output
.
reshape
(
*
output
.
shape
[:
2
],
len
(
predicate2id
),
2
)
return
object_preds
train_model
=
Model
().
to
(
device
)
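# A minimal illustrative sketch (not part of the original script) of what extract_subject does:
# torch.gather picks, for every sample in the batch, the hidden vectors at the subject's start
# and end positions, which are then concatenated into one subject representation. The tensor
# sizes and the helper name _demo_extract_subject below are assumptions for illustration only.
def _demo_extract_subject():
    output = torch.arange(2 * 5 * 4, dtype=torch.float).reshape(2, 5, 4)  # [btz=2, seq_len=5, hdsz=4]
    subject_ids = torch.tensor([[1, 3], [0, 2]])                          # (start, end) token index per sample
    start = torch.gather(output, dim=1, index=subject_ids[:, :1].unsqueeze(2).expand(-1, -1, output.shape[-1]))
    end = torch.gather(output, dim=1, index=subject_ids[:, 1:].unsqueeze(2).expand(-1, -1, output.shape[-1]))
    subject = torch.cat([start, end], 2)[:, 0]                            # [btz, 2*hdsz]
    assert torch.equal(subject[0], torch.cat([output[0, 1], output[0, 3]]))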
class BCELoss(nn.BCELoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, inputs, targets):
        subject_preds, object_preds = inputs
        subject_labels, object_labels, mask = targets

        # subject part of the loss
        subject_loss = super().forward(subject_preds, subject_labels)
        subject_loss = subject_loss.mean(dim=-1)
        subject_loss = (subject_loss * mask).sum() / mask.sum()

        # object part of the loss
        object_loss = super().forward(object_preds, object_labels)
        object_loss = object_loss.mean(dim=-1).sum(dim=-1)
        object_loss = (object_loss * mask).sum() / mask.sum()

        return subject_loss + object_loss

train_model.compile(loss=BCELoss(reduction='none'), optimizer=optim.Adam(train_model.parameters(), 1e-5))
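# A small sketch (an assumption, not in the original script) of the masked averaging used in
# BCELoss above: per-position losses are first averaged over the label dimension, and then only
# non-padding positions (mask == 1) contribute to the final mean, so padding never dilutes the loss.
def _demo_masked_loss_average():
    per_token_loss = torch.tensor([[0.2, 0.4, 0.6, 0.0]])  # [btz=1, seq_len=4], already reduced over labels
    mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])            # last position is padding
    masked_mean = (per_token_loss * mask).sum() / mask.sum()
    assert abs(masked_mean.item() - 0.4) < 1e-6            # (0.2 + 0.4 + 0.6) / 3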
def extract_spoes(text):
    """Extract the (subject, predicate, object) triples contained in the input text"""
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
    # extract subjects
    seq_output, subject_preds = train_model.predict_subject([token_ids, segment_ids])
    subject_preds[:, [0, -1]] *= 0  # zero out the leading [CLS] and trailing [SEP]
    start = torch.where(subject_preds[0, :, 0] > 0.6)[0]
    end = torch.where(subject_preds[0, :, 1] > 0.5)[0]
    subjects = []
    for i in start:
        j = end[end >= i]
        if len(j) > 0:
            j = j[0]
            subjects.append((i.item(), j.item()))
    if subjects:
        spoes = []
        # token_ids = token_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
        # segment_ids = segment_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
        seq_output = seq_output.repeat([len(subjects)]+[1]*(len(seq_output.shape)-1))
        subjects = torch.tensor(subjects, dtype=torch.long, device=device)
        # feed in the subjects and extract objects and predicates
        object_preds = train_model.predict_object([seq_output, subjects])
        object_preds[:, [0, -1]] *= 0
        for subject, object_pred in zip(subjects, object_preds):
            start = torch.where(object_pred[:, :, 0] > 0.6)
            end = torch.where(object_pred[:, :, 1] > 0.5)
            for _start, predicate1 in zip(*start):
                for _end, predicate2 in zip(*end):
                    if _start <= _end and predicate1 == predicate2:
                        spoes.append(
                            ((mapping[subject[0]][0], mapping[subject[1]][-1]), predicate1.item(),
                             (mapping[_start][0], mapping[_end][-1]))
                        )
                        break
        return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1]) for s, p, o in spoes]
    else:
        return []
class SPO(tuple):
    """Class for storing triples.
    It behaves essentially like a tuple, but overrides __hash__ and __eq__
    so that deciding whether two triples are equivalent is more tolerant.
    """
    def __init__(self, spo):
        self.spox = (
            tuple(tokenizer.tokenize(spo[0])),
            spo[1],
            tuple(tokenizer.tokenize(spo[2])),
        )

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox
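# Illustrative usage sketch (an assumption, not in the original script): because SPO hashes and
# compares on the tokenized form, the set intersection R & T in evaluate() below counts a predicted
# triple as correct whenever its tokenization matches the gold one, even if the surface strings
# differ in ways the tokenizer normalizes away.
def _demo_spo_sets():
    R = {SPO(('鲁迅', '作者', '呐喊'))}                                  # predicted triples
    T = {SPO(('鲁迅', '作者', '呐喊')), SPO(('鲁迅', '作者', '彷徨'))}   # gold triples
    assert len(R & T) == 1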
def evaluate(data):
    """Evaluation function: computes f1, precision and recall"""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        R = set([SPO(spo) for spo in extract_spoes(d['text'])])
        T = set([SPO(spo) for spo in d['spo_list']])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall))
        s = json.dumps({
            'text': d['text'],
            'spo_list': list(T),
            'spo_list_pred': list(R),
            'new': list(R - T),
            'lack': list(T - R),
        }, ensure_ascii=False, indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall
class Evaluator(Callback):
    """Evaluation and saving"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        # optimizer.apply_ema_weights()
        f1, precision, recall = evaluate(valid_dataset.data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            # train_model.save_weights('best_model.pt')
        # optimizer.reset_old_weights()
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
    evaluator = Evaluator()
    train_model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
    train_model.load_weights('best_model.pt')
examples/relation_extraction/task_relation_extraction_gplinker.py
0 → 100644
View file @
19a23d09
#! -*- coding:utf-8 -*-
# Triple extraction task: a TPLinker-style design built on GlobalPointer
# Write-up: https://kexue.fm/archives/8888
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.layers import GlobalPointer
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.losses import SparseMultilabelCategoricalCrossentropy
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np

maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the label dictionary
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Format of a single sample: {'text': text, 'spo_list': [(s, p, o)]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                D.append({'text': l['text'],
                          'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
        return D
def collate_fn(batch):
    def search(pattern, sequence):
        """Search for the subsequence pattern inside sequence.
        Returns the first index if found, otherwise -1.
        """
        n = len(pattern)
        for i in range(len(sequence)):
            if sequence[i:i + n] == pattern:
                return i
        return -1

    batch_token_ids, batch_segment_ids = [], []
    batch_entity_labels, batch_head_labels, batch_tail_labels = [], [], []
    for d in batch:
        token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
        # organize the triples {s: [(o, p)]}
        spoes = set()
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            sh = search(s, token_ids)
            oh = search(o, token_ids)
            if sh != -1 and oh != -1:
                spoes.add((sh, sh + len(s) - 1, p, oh, oh + len(o) - 1))
        # build the labels
        entity_labels = [set() for _ in range(2)]
        head_labels = [set() for _ in range(len(predicate2id))]
        tail_labels = [set() for _ in range(len(predicate2id))]
        for sh, st, p, oh, ot in spoes:
            entity_labels[0].add((sh, st))
            entity_labels[1].add((oh, ot))
            head_labels[p].add((sh, oh))
            tail_labels[p].add((st, ot))
        for label in entity_labels + head_labels + tail_labels:
            if not label:  # there must be at least one label
                label.add((0, 0))  # pad with (0, 0) if empty
        entity_labels = sequence_padding([list(l) for l in entity_labels])  # [subject/object=2, num_entities, entity start/end]
        head_labels = sequence_padding([list(l) for l in head_labels])  # [num_relations, num subject/object pairs under the relation, subject/object start]
        tail_labels = sequence_padding([list(l) for l in tail_labels])  # [num_relations, num subject/object pairs under the relation, subject/object end]
        # build the batch
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_entity_labels.append(entity_labels)
        batch_head_labels.append(head_labels)
        batch_tail_labels.append(tail_labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    # batch_entity_labels: [btz, subject/object=2, num_entities, entity start/end]
    # batch_head_labels: [btz, num_relations, num subject/object pairs under the relation, subject/object start]
    # batch_tail_labels: [btz, num_relations, num subject/object pairs under the relation, subject/object end]
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, seq_dims=2), dtype=torch.float, device=device)
    batch_head_labels = torch.tensor(sequence_padding(batch_head_labels, seq_dims=2), dtype=torch.float, device=device)
    batch_tail_labels = torch.tensor(sequence_padding(batch_tail_labels, seq_dims=2), dtype=torch.float, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path)
        self.entity_output = GlobalPointer(hidden_size=768, heads=2, head_size=64)
        self.head_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)
        self.tail_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)

    def forward(self, inputs):
        hidden_states = self.bert(inputs)  # [btz, seq_len, hdsz]
        mask = inputs[0].gt(0).long()
        entity_output = self.entity_output(hidden_states, mask)  # [btz, heads, seq_len, seq_len]
        head_output = self.head_output(hidden_states, mask)  # [btz, heads, seq_len, seq_len]
        tail_output = self.tail_output(hidden_states, mask)  # [btz, heads, seq_len, seq_len]
        return entity_output, head_output, tail_output

model = Model().to(device)
class MyLoss(SparseMultilabelCategoricalCrossentropy):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_preds, y_trues):
        ''' y_preds: [Tensor], each of shape [btz, heads, seq_len, seq_len]
        '''
        loss_list = []
        for y_pred, y_true in zip(y_preds, y_trues):
            shape = y_pred.shape
            # multiply by seq_len because when (i, j) is flattened into the seq_len*seq_len dimension its index is i*seq_len+j
            y_true = y_true[..., 0] * shape[2] + y_true[..., 1]  # [btz, heads, flat indices of entity start/end]
            y_pred = y_pred.reshape(shape[0], -1, np.prod(shape[2:]))  # [btz, heads, seq_len*seq_len]
            loss = super().forward(y_pred, y_true.long())
            loss = torch.mean(torch.sum(loss, dim=1))
            loss_list.append(loss)
        return {'loss': sum(loss_list) / 3, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}

model.compile(loss=MyLoss(mask_zero=True), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['entity_loss', 'head_loss', 'tail_loss'])
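# A small sketch (an assumption, not in the original script) of the index arithmetic inside MyLoss
# above: a sparse label (i, j) on a seq_len x seq_len score map corresponds to position
# i * seq_len + j once the last two axes are flattened, so gathering at that flat index hits the
# same score as indexing the unflattened map.
def _demo_flat_index():
    seq_len = 4
    scores = np.arange(seq_len * seq_len).reshape(seq_len, seq_len)  # toy [seq_len, seq_len] score map
    i, j = 1, 3                                                      # one labelled (start, end) pair
    assert scores.reshape(-1)[i * seq_len + j] == scores[i, j]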
def extract_spoes(text, threshold=0):
    """Extract the (subject, predicate, object) triples contained in the input text"""
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
    outputs = model.predict([token_ids, segment_ids])
    outputs = [o[0].cpu().numpy() for o in outputs]  # [heads, seq_len, seq_len]
    # extract subjects and objects
    subjects, objects = set(), set()
    outputs[0][:, [0, -1]] -= float('inf')
    outputs[0][:, :, [0, -1]] -= float('inf')
    for l, h, t in zip(*np.where(outputs[0] > threshold)):
        if l == 0:
            subjects.add((h, t))
        else:
            objects.add((h, t))
    # identify the corresponding predicates
    spoes = set()
    for sh, st in subjects:
        for oh, ot in objects:
            p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
            p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
            ps = set(p1s) & set(p2s)
            for p in ps:
                spoes.add((
                    text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[p],
                    text[mapping[oh][0]:mapping[ot][-1] + 1]
                ))
    return list(spoes)
class SPO(tuple):
    """Class for storing triples.
    It behaves essentially like a tuple, but overrides __hash__ and __eq__
    so that deciding whether two triples are equivalent is more tolerant.
    """
    def __init__(self, spo):
        self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox
def evaluate(data):
    """Evaluation function: computes f1, precision and recall"""
    X, Y, Z = 0, 1e-10, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        R = set([SPO(spo) for spo in extract_spoes(d['text'])])
        T = set([SPO(spo) for spo in d['spo_list']])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall))
        s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R), 'new': list(R - T), 'lack': list(T - R)},
                       ensure_ascii=False, indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall
class Evaluator(Callback):
    """Evaluation and saving"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        # optimizer.apply_ema_weights()
        f1, precision, recall = evaluate(valid_dataset.data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        # optimizer.reset_old_weights()
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/relation_extraction/task_relation_extraction_tplinker.py
0 → 100644
View file @
19a23d09
#! -*- coding:utf-8 -*-
# Triple extraction task with TPLinker; with the 'cat' shaking type the entity part converges fairly quickly while the relation part converges more slowly
# Official repo: https://github.com/131250208/TPlinker-joint-extraction
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.layers import TplinkerHandshakingKernel
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim

maxlen = 50
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the label dictionary
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Format of a single sample: {'text': text, 'spo_list': [(s, p, o)]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                D.append({'text': l['text'],
                          'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
        return D
def trans_ij2k(seq_len, i, j):
    '''Convert row i, column j into its index after flattening the upper triangle'''
    if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
        return 0
    return int(0.5 * (2 * seq_len - i + 1) * i + (j - i))

map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}
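# A quick round-trip sketch (an assumption, not in the original script): trans_ij2k enumerates the
# upper triangle row by row, so the (i, j) -> k mapping and its inverse recover each other, and the
# number of pairs equals maxlen * (maxlen + 1) // 2, which is exactly the pair_len used in collate_fn.
def _demo_triangle_flatten(n=4):
    fwd = {(i, j): trans_ij2k(n, i, j) for i in range(n) for j in range(n) if j >= i}
    inv = {v: k for k, v in fwd.items()}
    assert len(fwd) == n * (n + 1) // 2
    assert all(inv[fwd[(i, j)]] == (i, j) for (i, j) in fwd)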
def search(pattern, sequence):
    """Search for the subsequence pattern inside sequence.
    Returns the first index if found, otherwise -1.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1
def collate_fn(batch):
    pair_len = maxlen * (maxlen + 1) // 2
    # batch_entity_labels: [btz, pair_len]
    # batch_head_labels: [btz, rel_size, pair_len]
    # batch_tail_labels: [btz, rel_size, pair_len]
    batch_entity_labels = torch.zeros((len(batch), pair_len), dtype=torch.long, device=device)
    batch_head_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)
    batch_tail_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)

    batch_token_ids = []
    for i, d in enumerate(batch):
        token_ids = tokenizer.encode(d['text'])[0][1:-1][:maxlen]  # truncate to the first maxlen tokens
        batch_token_ids.append(token_ids)
        # organize the triples {s: [(o, p)]}
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            sh = search(s, token_ids)  # will not be found if it lies beyond the truncated length
            oh = search(o, token_ids)
            if sh != -1 and oh != -1:
                st, ot = sh + len(s) - 1, oh + len(o) - 1
                batch_entity_labels[i, map_ij2k[sh, st]] = 1
                batch_entity_labels[i, map_ij2k[oh, ot]] = 1
                if sh <= oh:
                    batch_head_labels[i, p, map_ij2k[sh, oh]] = 1
                else:
                    batch_head_labels[i, p, map_ij2k[oh, sh]] = 2
                if st <= ot:
                    batch_tail_labels[i, p, map_ij2k[st, ot]] = 1
                else:
                    batch_tail_labels[i, p, map_ij2k[ot, st]] = 2
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
    return [batch_token_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0)
        self.combine_fc = nn.Linear(768 * 2, 768)
        self.ent_fc = nn.Linear(768, 2)
        self.head_rel_fc = nn.Linear(768, len(predicate2id) * 3)
        self.tail_rel_fc = nn.Linear(768, len(predicate2id) * 3)
        self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cat')

    def forward(self, inputs):
        last_hidden_state = self.bert(inputs)  # [btz, seq_len, hdsz]
        shaking_hiddens = self.handshaking_kernel(last_hidden_state)  # [btz, pair_len, hdsz]
        ent_shaking_outputs = self.ent_fc(shaking_hiddens)  # [btz, pair_len, 2]
        btz, pair_len = shaking_hiddens.shape[:2]
        head_rel_shaking_outputs = self.head_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3)  # [btz, predicate_num, pair_len, 3]
        tail_rel_shaking_outputs = self.tail_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3)
        return ent_shaking_outputs, head_rel_shaking_outputs, tail_rel_shaking_outputs

model = Model().to(device)
class MyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_preds, y_trues):
        loss_list = []
        for y_pred, y_true in zip(y_preds, y_trues):
            loss = super().forward(y_pred.view(-1, y_pred.size()[-1]), y_true.view(-1))
            loss_list.append(loss)

        z = (2 * len(predicate2id) + 1)
        total_steps = 6000  # entity recognition gets a higher weight early on; this could also be set to model.total_steps
        w_ent = max(1 / z + 1 - model.global_step / total_steps, 1 / z)
        w_rel = min((len(predicate2id) / z) * model.global_step / total_steps, (len(predicate2id) / z))
        loss = w_ent * loss_list[0] + w_rel * loss_list[1] + w_rel * loss_list[2]
        return {'loss': loss, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 5e-5), metrics=['entity_loss', 'head_loss', 'tail_loss'])
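# A hedged sketch (an assumption, not in the original script) of the loss-weight schedule inside
# MyLoss above: with z = 2 * num_relations + 1, the entity weight decays linearly towards 1/z over
# total_steps while the relation weight grows towards num_relations / z, so entity recognition
# dominates early training and relation classification takes over later. The default num_relations
# below is illustrative only.
def _demo_loss_weights(num_relations=49, total_steps=6000):
    z = 2 * num_relations + 1
    schedule = []
    for step in (0, total_steps // 2, total_steps):
        w_ent = max(1 / z + 1 - step / total_steps, 1 / z)
        w_rel = min((num_relations / z) * step / total_steps, num_relations / z)
        schedule.append((step, round(w_ent, 3), round(w_rel, 3)))
    return schedule  # e.g. [(0, 1.01, 0.0), (3000, 0.51, 0.247), (6000, 0.01, 0.495)]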
def extract_spoes(text):
    """Extract the (subject, predicate, object) triples contained in the input text"""
    def get_spots_fr_shaking_tag(shaking_tag):
        '''Parse the relation tags'''
        spots = []
        for shaking_inds in shaking_tag.nonzero():
            rel_id = shaking_inds[0].item()
            tag_id = shaking_tag[rel_id][shaking_inds[1]].item()
            matrix_inds = map_k2ij[shaking_inds[1].item()]
            # make sure the subject comes first and the object second
            if tag_id == 1:
                spot = (rel_id, matrix_inds[0], matrix_inds[1])
            elif tag_id == 2:
                spot = (rel_id, matrix_inds[1], matrix_inds[0])
            spots.append(spot)
        return spots

    tokens = tokenizer.tokenize(text)[1:-1]
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.encode(text)[0][1:-1]
    token_ids_ts = torch.tensor(sequence_padding([token_ids], length=maxlen), dtype=torch.long, device=device)
    outputs = model.predict([token_ids_ts])
    outputs = [o[0].argmax(dim=-1) for o in outputs]

    # extract entities
    ent_matrix_spots = set()
    ent_text = set()
    for shaking_ind in outputs[0].nonzero():
        shaking_ind_ = shaking_ind[0].item()
        # tag_id = outputs[0][shaking_ind_]
        matrix_inds = map_k2ij[shaking_ind_]
        spot = (matrix_inds[0], matrix_inds[1])
        if (spot[0] < len(mapping)) and (spot[1] < len(mapping)):  # entity start and end lie within the mapping range
            ent_matrix_spots.add(spot)
            ent_text.add(text[mapping[spot[0]][0]:mapping[spot[1]][-1] + 1])

    # identify the corresponding predicates
    head_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[1])
    tail_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[2])
    spoes = []
    for rel_h, sh, oh in head_rel_matrix_spots:
        for rel_t, st, ot in tail_rel_matrix_spots:
            # if the relations match and both (sh, st) and (oh, ot) are in ent_matrix_spots
            if (rel_h == rel_t) and ((sh, st) in ent_matrix_spots) and ((oh, ot) in ent_matrix_spots):
                spoes.append((text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[rel_h],
                              text[mapping[oh][0]:mapping[ot][-1] + 1]))
    return spoes, token_ids, ent_text
class SPO(tuple):
    """Class for storing triples.
    It behaves essentially like a tuple, but overrides __hash__ and __eq__
    so that deciding whether two triples are equivalent is more tolerant.
    """
    def __init__(self, spo):
        self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox
def evaluate(data):
    """Evaluation function: computes f1, precision and recall"""
    X, Y, Z = 0, 1e-10, 1e-10
    E1, E2 = 0, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        spoes, token_ids, ent_text_pred = extract_spoes(d['text'])
        # spo_list is pruned according to maxlen
        spo_list = []
        for s, p, o in d['spo_list']:
            s_ = tokenizer.encode(s)[0][1:-1]
            o_ = tokenizer.encode(o)[0][1:-1]
            sh = search(s_, token_ids)  # will not be found if it lies beyond the truncated length
            oh = search(o_, token_ids)
            if sh != -1 and oh != -1:
                spo_list.append((s, p, o))
        # compute the f1 of the triples
        R = set([SPO(spo) for spo in spoes])
        T = set([SPO(spo) for spo in spo_list])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        # compute the entity metric
        ent_text_truth = set([spo[0] for spo in spo_list] + [spo[-1] for spo in spo_list])
        E1 += len(ent_text_pred & ent_text_truth)
        E2 += len(ent_text_truth)
        E_acc = E1 / E2
        # the accuracy of entity_matrix, head_matrix and tail_matrix could also be computed here
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f, ent_acc: %.5f' % (f1, precision, recall, E_acc))
        s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R), 'new': list(R - T), 'lack': list(T - R)},
                       ensure_ascii=False, indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall
class Evaluator(Callback):
    """Evaluation and saving"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataset.data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))
if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')