Commit 0e29b9b7 authored Jan 20, 2026 by xuxo
yidong infer init
Pipeline #3252 failed with stages in 0 seconds
Changes: 150 · Pipelines: 1
Showing 20 changed files with 3291 additions and 0 deletions (+3291, -0)
bert/bert4torch_cmcc/examples/convert_script/convert_bart__cloudwalk.py  +270  -0
bert/bert4torch_cmcc/examples/convert_script/convert_bart_fudanNLP.py  +34  -0
bert/bert4torch_cmcc/examples/convert_script/convert_bert-base-chinese.py  +52  -0
bert/bert4torch_cmcc/examples/convert_script/convert_gpt2__cmp_lm_2.6b.py  +107  -0
bert/bert4torch_cmcc/examples/convert_script/convert_gpt2__gpt2-ml.py  +109  -0
bert/bert4torch_cmcc/examples/convert_script/convert_gpt__CDial-GPT-LCCC.py  +106  -0
bert/bert4torch_cmcc/examples/convert_script/convert_nezha_gpt_dialog.py  +75  -0
bert/bert4torch_cmcc/examples/convert_script/convert_roberta_chess.py  +80  -0
bert/bert4torch_cmcc/examples/convert_script/convert_t5_pegasus.py  +107  -0
bert/bert4torch_cmcc/examples/convert_script/convert_transformer_xl.py  +104  -0
bert/bert4torch_cmcc/examples/others/task_conditional_language_model.py  +177  -0
bert/bert4torch_cmcc/examples/others/task_iflytek_bert_of_theseus.py  +212  -0
bert/bert4torch_cmcc/examples/others/task_language_model.py  +175  -0
bert/bert4torch_cmcc/examples/others/task_language_model_chinese_chess.py  +214  -0
bert/bert4torch_cmcc/examples/others/task_nl2sql_baseline.py  +380  -0
bert/bert4torch_cmcc/examples/pretrain/roberta_pretrain/pretrain_roberta_mlm.py  +151  -0
bert/bert4torch_cmcc/examples/pretrain/roberta_pretrain/pretrain_roberta_mlm_data_gen.py  +223  -0
bert/bert4torch_cmcc/examples/pretrain/simbert_v2_pretrain/simbert_v2_stage1.py  +264  -0
bert/bert4torch_cmcc/examples/pretrain/simbert_v2_pretrain/simbert_v2_stage2.py  +287  -0
bert/bert4torch_cmcc/examples/pretrain/simbert_v2_pretrain/simbert_v2_supervised.py  +164  -0
Too many changes to show: to preserve performance, only 150 of 150+ files are displayed.
bert/bert4torch_cmcc/examples/convert_script/convert_bart__cloudwalk.py
0 → 100644
#! -*- coding: utf-8 -*-
# Convert cloudwalk's pretrained BART model into weights usable by bert4torch
# Weight link (Baidu Netdisk address):
import torch

ckpt_file = 'F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/pytorch_base_model_2024000.pt'
torch_weights = torch.load(ckpt_file)

# mapping from cloudwalk key names to the fairseq-style names expected by bert4torch
map = {
    'bart.embeddings.word_embeddings.weight': 'encoder.embed_tokens.weight',
    'bart.embeddings.position_embeddings.weight': 'encoder.embed_positions.weight',
    'bart.embeddings.LayerNorm.weight': 'encoder.layernorm_embedding.weight',
    'bart.embeddings.LayerNorm.bias': 'encoder.layernorm_embedding.bias',
}

# per-layer sub-module mapping shared by the 6 encoder and 6 decoder layers
# (the committed file lists all 256 key pairs literally; the same mapping is generated per layer here)
self_attn_map = {
    'attention.self.query': 'self_attn.q_proj',
    'attention.self.key': 'self_attn.k_proj',
    'attention.self.value': 'self_attn.v_proj',
    'attention.output.dense': 'self_attn.out_proj',
    'attention.output.LayerNorm': 'self_attn_layer_norm',
    'intermediate.dense': 'fc1',
    'output.dense': 'fc2',
    'output.LayerNorm': 'final_layer_norm',
}
# cross-attention sub-modules, present only in the decoder layers
cross_attn_map = {
    'crossattention.self.query': 'encoder_attn.q_proj',
    'crossattention.self.key': 'encoder_attn.k_proj',
    'crossattention.self.value': 'encoder_attn.v_proj',
    'crossattention.output.dense': 'encoder_attn.out_proj',
    'crossattention.output.LayerNorm': 'encoder_attn_layer_norm',
}

for i in range(6):
    for old, new in self_attn_map.items():
        for suffix in ('weight', 'bias'):
            map[f'bart.encoder.encoder_layer.{i}.{old}.{suffix}'] = f'encoder.layers.{i}.{new}.{suffix}'
            map[f'bart.decoder.decoder_layer.{i}.{old}.{suffix}'] = f'decoder.layers.{i}.{new}.{suffix}'
    for old, new in cross_attn_map.items():
        for suffix in ('weight', 'bias'):
            map[f'bart.decoder.decoder_layer.{i}.{old}.{suffix}'] = f'decoder.layers.{i}.{new}.{suffix}'

model_new = {}
for key, value in map.items():
    model_new[value] = torch_weights[key]

torch.save(model_new, 'F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/bert4torch_pytorch_model.bin')
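A quick way to validate a key mapping like this is to reload both checkpoints and spot-check a few renamed tensors. The snippet below is an illustrative sanity check, not part of the commit; it assumes the two files written above exist locally, and the expected key count follows from 4 embedding tensors plus 16 tensors per encoder layer and 26 per decoder layer.

# illustrative sanity check (not part of the commit); assumes the paths used above
import torch

src = torch.load('F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/pytorch_base_model_2024000.pt')
dst = torch.load('F:/Projects/pretrain_ckpt/bart/[cloudwalk_torch_base]/bert4torch_pytorch_model.bin')

# 4 embedding tensors + 6 encoder layers * 16 + 6 decoder layers * 26 = 256 mapped tensors
assert len(dst) == 4 + 6 * 16 + 6 * 26

# a renamed tensor should still hold exactly the same values as its source tensor
assert torch.equal(src['bart.embeddings.word_embeddings.weight'], dst['encoder.embed_tokens.weight'])
assert torch.equal(src['bart.decoder.decoder_layer.5.crossattention.self.query.weight'],
                   dst['decoder.layers.5.encoder_attn.q_proj.weight'])
print(dst['encoder.embed_tokens.weight'].shape)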
bert/bert4torch_cmcc/examples/convert_script/convert_bart_fudanNLP.py
0 → 100644
#! -*- coding: utf-8 -*-
# Convert FUDAN's (fastnlp) pretrained BART model into weights usable by bert4torch
# Weights: https://github.com/fastnlp/CPT
import torch

state_dict = torch.load('F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/pytorch_model.bin')
state_dict_new = {}

for k, v in state_dict.items():
    # The main change: the checkpoint has 514 positions by default; drop the first two
    if 'embed_positions.weight' in k:
        v = v[2:]
        state_dict_new[k] = v
    else:
        state_dict_new[k] = v

torch.save(state_dict_new, 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_pytorch_model.bin')

'''config
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "type_vocab_size": 2,
  "vocab_size": 21128
}
'''
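The only transformation here is dropping the first two rows of every position-embedding matrix, taking it from 514 rows down to the 512 positions declared in the config. A toy illustration with a random tensor instead of the real checkpoint:

# toy illustration only; the real tensor comes from the FudanNLP checkpoint above
import torch

pos = torch.randn(514, 768)                  # 514 positions, hidden_size 768
pos_trimmed = pos[2:]                        # same slicing as v[2:] in the loop above
print(pos_trimmed.shape)                     # torch.Size([512, 768]) -> matches max_position_embeddings: 512
assert torch.equal(pos_trimmed[0], pos[2])   # old position 2 becomes new position 0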
bert/bert4torch_cmcc/examples/convert_script/convert_bert-base-chinese.py
0 → 100644
# Convert the bert-base-chinese weights from huggingface
# Weight link: https://huggingface.co/bert-base-chinese
# The keys are not fully aligned with this framework's keys: the checkpoint mostly stores the
# LayerNorm weight and bias as LayerNorm.gamma and LayerNorm.beta
# You can also convert the TF weights (https://github.com/google-research/bert) with the command
# built into transformers: https://huggingface.co/docs/transformers/converting_tensorflow_models
import torch

state_dict = torch.load('F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/pytorch_model.bin')
state_dict_new = {}

for k, v in state_dict.items():
    if 'LayerNorm.gamma' in k:
        k = k.replace('LayerNorm.gamma', 'LayerNorm.weight')
        state_dict_new[k] = v
    elif 'LayerNorm.beta' in k:
        k = k.replace('LayerNorm.beta', 'LayerNorm.bias')
        state_dict_new[k] = v
    else:
        state_dict_new[k] = v

torch.save(state_dict_new, 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/bert4torch_pytorch_model.bin')

# config
'''
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}
'''
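Since only the key names change (gamma/beta become weight/bias) and the tensors are passed through untouched, the renaming can be checked on the key strings alone. A small sketch with made-up key names mimicking the checkpoint layout, no checkpoint required:

# illustrative only; the key names below are hypothetical examples of the checkpoint layout
keys = [
    'bert.embeddings.LayerNorm.gamma',
    'bert.encoder.layer.0.attention.output.LayerNorm.beta',
    'bert.embeddings.word_embeddings.weight',
]
renamed = [k.replace('LayerNorm.gamma', 'LayerNorm.weight')
            .replace('LayerNorm.beta', 'LayerNorm.bias') for k in keys]
print(renamed)
# ['bert.embeddings.LayerNorm.weight',
#  'bert.encoder.layer.0.attention.output.LayerNorm.bias',
#  'bert.embeddings.word_embeddings.weight']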
bert/bert4torch_cmcc/examples/convert_script/convert_gpt2__cmp_lm_2.6b.py
0 → 100644
#! -*- coding: utf-8 -*-
# Convert Tsinghua's open-source Chinese GPT2 model (2.6 billion parameters)
# Project link (TF version): https://github.com/TsinghuaAI/CPM-Generate
# PyTorch weight download: https://huggingface.co/TsinghuaAI/CPM-Generate; this script converts them into weights usable by bert4torch
import torch

ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 32


def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt2'

    w = torch_weights['transformer.wte.weight']
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w

    w = torch_weights['transformer.wpe.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense layer (attention output projection)
        w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['transformer.h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['transformer.h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    # final layernorm
    w = torch_weights['transformer.ln_f.weight']
    new_weights[f'{prefix}.LayerNormFinal.weight'] = w
    b = torch_weights['transformer.ln_f.bias']
    new_weights[f'{prefix}.LayerNormFinal.bias'] = b

    torch.save(new_weights, output_ckpt_file)


if __name__ == '__main__':
    convert()

# config file
'''
{
  "vocab_size": 30000,
  "hidden_size": 2560,
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "initializer_range": 0.014142135623731,
  "intermediate_size": 10240,
  "max_position_embeddings": 1024,
  "num_attention_heads": 32,
  "num_hidden_layers": 32
}
'''
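The .T and torch.chunk(..., dim=1) calls are the core of this conversion: assuming the CPM checkpoint follows the Hugging Face GPT-2 Conv1D layout, the fused c_attn weight is stored as [hidden, 3*hidden] (inputs by outputs), so it is split along the output dimension into q/k/v and transposed into the [out_features, in_features] layout that nn.Linear (and bert4torch) expects. A toy illustration with a random tensor:

# toy illustration; assumes the HF GPT-2 Conv1D layout described above
import torch

hidden = 2560                                  # hidden_size from the config above
c_attn_w = torch.randn(hidden, 3 * hidden)     # fused q/k/v projection, [in, 3*out]
q, k, v = torch.chunk(c_attn_w, 3, dim=1)      # split the output dimension into q, k, v
print(q.shape)                                 # torch.Size([2560, 2560])
print(q.T.shape)                               # transposed to nn.Linear's [out_features, in_features]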
bert/bert4torch_cmcc/examples/convert_script/convert_gpt2__gpt2-ml.py
0 → 100644
#! -*- coding: utf-8 -*-
# gpt2-ml
# Project link (TF version): https://github.com/imcaspar/gpt2-ml
# PyTorch weight conversion and download: https://github.com/ghosthamlet/gpt2-ml-torch
# Finally, this script converts them into weights usable by bert4torch
import torch

ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = f'{ckpt_dir}/bert4torch_pytorch_model.bin'
num_hidden_layers = 48


def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt2_ml'

    w = torch_weights['wte.weight']
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w

    w = torch_weights['wpe.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    # embedding layernorm
    w = torch_weights['emb_norm.weight']
    new_weights[f'{prefix}.embeddings.LayerNorm.weight'] = w
    b = torch_weights['emb_norm.bias']
    new_weights[f'{prefix}.embeddings.LayerNorm.bias'] = b

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense layer (attention output projection)
        w = torch_weights['h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    torch.save(new_weights, output_ckpt_file)


if __name__ == '__main__':
    convert()

# config file
'''
{
  "vocab_size": 21130,
  "hidden_size": 1536,
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "initializer_range": 0.014142135623731,
  "intermediate_size": 6144,
  "max_position_embeddings": 1024,
  "num_attention_heads": 24,
  "num_hidden_layers": 48
}
'''
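As a rough post-conversion check, the converted state dict should contain 4 embedding tensors plus 16 tensors per layer, all under the gpt2_ml. prefix. A minimal sketch, assuming the conversion above has been run and the output file exists at the path used there:

# illustrative check only; assumes the conversion above has been run
import torch

converted = torch.load('F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/bert4torch_pytorch_model.bin')
assert len(converted) == 4 + 48 * 16           # embeddings + 48 layers * 16 tensors each
assert all(k.startswith('gpt2_ml.') for k in converted)
print(len(converted), 'tensors converted')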
bert/bert4torch_cmcc/examples/convert_script/convert_gpt__CDial-GPT-LCCC.py
0 → 100644
#! -*- coding: utf-8 -*-
# Convert CDial-GPT's PyTorch weights into weights that bert4torch can load; both base and large can be converted
# Project link (torch version): https://github.com/thu-coai/CDial-GPT
import torch

ckpt_dir = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base'
ckpt_file = f'{ckpt_dir}/pytorch_model.bin'
output_ckpt_file = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_pytorch_model.bin'
num_hidden_layers = 12


def convert():
    torch_weights = torch.load(ckpt_file)
    new_weights = {}
    prefix = 'gpt'

    # In CDial-GPT, [CLS] is id 0 and [PAD] is id 1, which is not the usual convention, so swap the two rows
    w = torch_weights['transformer.tokens_embed.weight']
    w = torch.cat([w[1:2], w[:1], w[2:]], axis=0)
    new_weights[f'{prefix}.embeddings.word_embeddings.weight'] = w

    w = torch_weights['transformer.positions_embed.weight']
    new_weights[f'{prefix}.embeddings.position_embeddings.weight'] = w

    qkv = ['query', 'key', 'value']
    for i in range(num_hidden_layers):
        prefix_i = f'{prefix}.encoder.layer.%d.' % i

        # q, k, v
        w = torch_weights['transformer.h.%s.attn.c_attn.weight' % i]
        ws = torch.chunk(w, 3, dim=1)
        for k, w in zip(qkv, ws):
            name = prefix_i + f'attention.self.{k}.weight'
            new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_attn.bias' % i]
        bs = torch.chunk(b, 3, dim=0)
        for k, b in zip(qkv, bs):
            name = prefix_i + f'attention.self.{k}.bias'
            new_weights[name] = b

        # hidden-to-hidden dense layer (attention output projection)
        w = torch_weights['transformer.h.%s.attn.c_proj.weight' % i]
        name = prefix_i + 'attention.output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.attn.c_proj.bias' % i]
        name = prefix_i + 'attention.output.dense.bias'
        new_weights[name] = b

        # layernorm1
        w = torch_weights['transformer.h.%s.ln_1.weight' % i]
        name = prefix_i + 'attention.output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_1.bias' % i]
        name = prefix_i + 'attention.output.LayerNorm.bias'
        new_weights[name] = b

        # feed forward, first layer
        w = torch_weights['transformer.h.%s.mlp.c_fc.weight' % i]
        name = prefix_i + 'intermediate.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_fc.bias' % i]
        name = prefix_i + 'intermediate.dense.bias'
        new_weights[name] = b

        # feed forward, second layer
        w = torch_weights['transformer.h.%s.mlp.c_proj.weight' % i]
        name = prefix_i + 'output.dense.weight'
        new_weights[name] = w.T
        b = torch_weights['transformer.h.%s.mlp.c_proj.bias' % i]
        name = prefix_i + 'output.dense.bias'
        new_weights[name] = b

        # layernorm2
        w = torch_weights['transformer.h.%s.ln_2.weight' % i]
        name = prefix_i + 'output.LayerNorm.weight'
        new_weights[name] = w
        b = torch_weights['transformer.h.%s.ln_2.bias' % i]
        name = prefix_i + 'output.LayerNorm.bias'
        new_weights[name] = b

    torch.save(new_weights, output_ckpt_file)


if __name__ == '__main__':
    convert()

# config file
'''
{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 513,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "vocab_size": 13088,
  "type_vocab_size": 3,
  "shared_segment_embeddings": true
}
'''
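The torch.cat([w[1:2], w[:1], w[2:]]) expression swaps the first two embedding rows so that the converted vocabulary ends up with [PAD] at index 0 and [CLS] at index 1. A toy illustration with a 4-row stand-in for the embedding matrix:

# toy illustration of the row swap; the real tensor is transformer.tokens_embed.weight
import torch

emb = torch.arange(4.0).unsqueeze(1)                      # rows 0, 1, 2, 3
swapped = torch.cat([emb[1:2], emb[:1], emb[2:]], axis=0)
print(swapped.squeeze(1))                                 # tensor([1., 0., 2., 3.]) -> rows 0 and 1 traded places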
bert/bert4torch_cmcc/examples/convert_script/convert_nezha_gpt_dialog.py
0 → 100644
# NEZHA weights already fine-tuned by Su Jianlin (苏神) for the chit-chat task; note this is NOT a pretrained model
# Source project: https://github.com/bojone/nezha_gpt_dialog
import torch
import tensorflow as tf

tf_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/model.ckpt'
torch_state_dict = {}
prefix = 'bert'

mapping = {
    'bert/embeddings/word_embeddings': f'{prefix}.embeddings.word_embeddings.weight',
    'bert/embeddings/token_type_embeddings': f'{prefix}.embeddings.token_type_embeddings.weight',
    'bert/embeddings/LayerNorm/beta': f'{prefix}.embeddings.LayerNorm.bias',
    'bert/embeddings/LayerNorm/gamma': f'{prefix}.embeddings.LayerNorm.weight',
    'cls/predictions/transform/dense/kernel': 'cls.predictions.transform.dense.weight##',
    'cls/predictions/transform/dense/bias': 'cls.predictions.transform.dense.bias',
    'cls/predictions/transform/LayerNorm/beta': 'cls.predictions.transform.LayerNorm.bias',
    'cls/predictions/transform/LayerNorm/gamma': 'cls.predictions.transform.LayerNorm.weight',
    'cls/predictions/output_bias': 'cls.predictions.bias'
}

for i in range(12):
    prefix_i = f'{prefix}.encoder.layer.%d.' % i
    mapping.update({
        f'bert/encoder/layer_{i}/attention/self/query/kernel': prefix_i + 'attention.self.query.weight##',  # the '##' suffix marks tensors that need a transpose
        f'bert/encoder/layer_{i}/attention/self/query/bias': prefix_i + 'attention.self.query.bias',
        f'bert/encoder/layer_{i}/attention/self/key/kernel': prefix_i + 'attention.self.key.weight##',
        f'bert/encoder/layer_{i}/attention/self/key/bias': prefix_i + 'attention.self.key.bias',
        f'bert/encoder/layer_{i}/attention/self/value/kernel': prefix_i + 'attention.self.value.weight##',
        f'bert/encoder/layer_{i}/attention/self/value/bias': prefix_i + 'attention.self.value.bias',
        f'bert/encoder/layer_{i}/attention/output/dense/kernel': prefix_i + 'attention.output.dense.weight##',
        f'bert/encoder/layer_{i}/attention/output/dense/bias': prefix_i + 'attention.output.dense.bias',
        f'bert/encoder/layer_{i}/attention/output/LayerNorm/beta': prefix_i + 'attention.output.LayerNorm.bias',
        f'bert/encoder/layer_{i}/attention/output/LayerNorm/gamma': prefix_i + 'attention.output.LayerNorm.weight',
        f'bert/encoder/layer_{i}/intermediate/dense/kernel': prefix_i + 'intermediate.dense.weight##',
        f'bert/encoder/layer_{i}/intermediate/dense/bias': prefix_i + 'intermediate.dense.bias',
        f'bert/encoder/layer_{i}/output/dense/kernel': prefix_i + 'output.dense.weight##',
        f'bert/encoder/layer_{i}/output/dense/bias': prefix_i + 'output.dense.bias',
        f'bert/encoder/layer_{i}/output/LayerNorm/beta': prefix_i + 'output.LayerNorm.bias',
        f'bert/encoder/layer_{i}/output/LayerNorm/gamma': prefix_i + 'output.LayerNorm.weight'
    })

for key, value in mapping.items():
    ts = tf.train.load_variable(tf_path, key)
    if value.endswith('##'):
        value = value.replace('##', '')
        torch_state_dict[value] = torch.from_numpy(ts).T
    else:
        torch_state_dict[value] = torch.from_numpy(ts)

torch_state_dict['cls.predictions.decoder.weight'] = torch_state_dict[f'{prefix}.embeddings.word_embeddings.weight']
torch_state_dict['cls.predictions.decoder.bias'] = torch_state_dict['cls.predictions.bias']
torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin')

# config file
'''
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "max_relative_position": 64,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 14195,
  "use_relative_position": true
}
'''
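Before running a TF-to-PyTorch mapping like this one, it can help to confirm that every source variable name actually exists in the checkpoint, since a missing key fails only mid-loop. A minimal pre-check sketch using tf.train.list_variables; it assumes the mapping dict and tf_path from the script above are in scope:

# illustrative pre-check; assumes `mapping` and `tf_path` from the script above
import tensorflow as tf

available = {name for name, _shape in tf.train.list_variables(tf_path)}
missing = [k for k in mapping if k not in available]
print(missing if missing else 'all mapped TF variables found in the checkpoint')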
bert/bert4torch_cmcc/examples/convert_script/convert_roberta_chess.py
0 → 100644
# Train a model to play Chinese chess with supervised "language model + game records" training
# Introduction: https://kexue.fm/archives/7877
# This only converts the model already trained by Su Jianlin (苏神); note it is NOT a pretrained model
import numpy as np
import h5py
import torch
# keras==2.3.1 is used here
from keras.engine import saving

tf_path = 'E:/Github/bert4keras/examples/best_model_chess.weights'
torch_state_dict = {}

# 1 means transpose, 0 means keep unchanged
key_map = {
    'Embedding-Token/embeddings:0': ['embeddings.word_embeddings.weight', 0],
    'Embedding-Segment/embeddings:0': ['embeddings.segment_embeddings.weight', 0],
    'Embedding-Position/embeddings:0': ['embeddings.position_embeddings.weight', 0],
    'Embedding-Norm/gamma:0': ['embeddings.layerNorm.weight', 0],
    'Embedding-Norm/beta:0': ['embeddings.layerNorm.bias', 0],
    'MLM-Dense/kernel:0': ['mlmDense.weight', 1],
    'MLM-Dense/bias:0': ['mlmDense.bias', 0],
    'MLM-Norm/gamma:0': ['mlmLayerNorm.weight', 0],
    'MLM-Norm/beta:0': ['mlmLayerNorm.bias', 0],
    'MLM-Bias/bias:0': ['mlmBias', 0],
}

for i in range(12):
    key_map.update({
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.q.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+1}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.q.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.k.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+2}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.k.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.v.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+3}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.v.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/kernel:0': [f'encoderLayer.{i}.multiHeadAttention.o.weight', 1],
        f'Transformer-{i}-MultiHeadSelfAttention/dense_{i*6+4}/bias:0': [f'encoderLayer.{i}.multiHeadAttention.o.bias', 0],
        f'Transformer-{i}-MultiHeadSelfAttention-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm1.weight', 0],
        f'Transformer-{i}-MultiHeadSelfAttention-Norm/beta:0': [f'encoderLayer.{i}.layerNorm1.bias', 0],
        f'Transformer-{i}-FeedForward/dense_{i*6+5}/kernel:0': [f'encoderLayer.{i}.feedForward.intermediateDense.weight', 1],
        f'Transformer-{i}-FeedForward/dense_{i*6+5}/bias:0': [f'encoderLayer.{i}.feedForward.intermediateDense.bias', 0],
        f'Transformer-{i}-FeedForward/dense_{i*6+6}/kernel:0': [f'encoderLayer.{i}.feedForward.outputDense.weight', 1],
        f'Transformer-{i}-FeedForward/dense_{i*6+6}/bias:0': [f'encoderLayer.{i}.feedForward.outputDense.bias', 0],
        f'Transformer-{i}-FeedForward-Norm/gamma:0': [f'encoderLayer.{i}.layerNorm2.weight', 0],
        f'Transformer-{i}-FeedForward-Norm/beta:0': [f'encoderLayer.{i}.layerNorm2.bias', 0],
    })

consume_keys = set()
with h5py.File(tf_path, mode='r') as f:
    if 'layer_names' not in f.attrs and 'model_weights' in f:
        f = f['model_weights']
    layer_names = saving.load_attributes_from_hdf5_group(f, 'layer_names')
    weight_value_tuples = []
    for k, name in enumerate(layer_names):
        g = f[name]
        weight_names = saving.load_attributes_from_hdf5_group(g, 'weight_names')
        weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for i, weight_name in enumerate(weight_names):
            new_key = key_map[weight_name][0]
            if key_map[weight_name][1] == 1:  # transpose
                torch_state_dict[new_key] = torch.from_numpy(weight_values[i]).T
            else:
                torch_state_dict[new_key] = torch.from_numpy(weight_values[i])
            assert new_key not in consume_keys, 'duplicate keys'
            consume_keys.add(new_key)
    if hasattr(f, 'close'):
        f.close()
    elif hasattr(f.file, 'close'):
        f.file.close()

torch_state_dict['mlmDecoder.weight'] = torch_state_dict['embeddings.word_embeddings.weight']
torch_state_dict['mlmDecoder.bias'] = torch_state_dict['mlmBias']

# for k, v in torch_state_dict.items():
#     print(k, v.shape)
torch.save(torch_state_dict, 'E:/Github/bert4torch/examples/others/best_model_chess.pt')
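The script above needs keras==2.3.1 just for keras.engine.saving. To only peek at what the .weights file contains, h5py on its own is enough, since Keras stores the layer names as an HDF5 attribute; the sketch below is illustrative and assumes the attribute is not split into chunked layer_names0, layer_names1, ... attributes:

# illustrative inspection only; uses h5py directly instead of keras.engine.saving
import h5py

path = 'E:/Github/bert4keras/examples/best_model_chess.weights'
with h5py.File(path, 'r') as f:
    g = f['model_weights'] if 'layer_names' not in f.attrs and 'model_weights' in f else f
    layer_names = [n.decode() if isinstance(n, bytes) else n for n in g.attrs['layer_names']]
    print(len(layer_names), 'layers;', layer_names[:3])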
bert/bert4torch_cmcc/examples/convert_script/convert_t5_pegasus.py
0 → 100644
# Convert t5_pegasus from TF into a PyTorch version adapted for bert4torch
# Weight link: https://github.com/ZhuiyiTechnology/t5-pegasus
import torch
import tensorflow as tf
import json

# small
tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_small]--chinese_t5_pegasus_small/'
torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small/pytorch_model.bin'
# base:
# tf_dir = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_tf_base]--chinese_t5_pegasus_base/'
# torch_path = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/pytorch_model.bin'
tf_path = tf_dir + 'model.ckpt'

with open(tf_dir + 'config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
num_layers = config['num_hidden_layers']

torch_state_dict = {}

mapping = {
    'shared/embedding': 'shared.weight',
    'encoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',  # custom marker: a trailing ##T means the tensor needs a transpose
    'encoder/rms_norm/scale': 'encoder.final_layer_norm.weight',
    'decoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',
    'decoder/rms_norm/scale': 'decoder.final_layer_norm.weight',
    'decoder/logits/kernel': 'lm_head.weight##T'
}

for i in range(num_layers):
    i1 = str(i).rjust(3, '0')
    mapping.update({
        f'encoder/block_{i1}/layer_000/SelfAttention/q': f'encoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/k': f'encoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/v': f'encoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/o': f'encoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'encoder/block_{i1}/layer_000/rms_norm/scale': f'encoder.block.{i}.layer.0.layer_norm.weight',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_0/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_1/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wo/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wo.weight##T',
        f'encoder/block_{i1}/layer_001/rms_norm/scale': f'encoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_000/SelfAttention/q': f'decoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/k': f'decoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/v': f'decoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/o': f'decoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'decoder/block_{i1}/layer_000/rms_norm/scale': f'decoder.block.{i}.layer.0.layer_norm.weight',
        f'decoder/block_{i1}/layer_001/EncDecAttention/q': f'decoder.block.{i}.layer.1.EncDecAttention.q.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/k': f'decoder.block.{i}.layer.1.EncDecAttention.k.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/v': f'decoder.block.{i}.layer.1.EncDecAttention.v.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/o': f'decoder.block.{i}.layer.1.EncDecAttention.o.weight##T',
        f'decoder/block_{i1}/layer_001/rms_norm/scale': f'decoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_0/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_1/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wo/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wo.weight##T',
        f'decoder/block_{i1}/layer_002/rms_norm/scale': f'decoder.block.{i}.layer.2.layer_norm.weight',
    })

transpose_layers = ['']
for k, v in mapping.items():
    ts = torch.from_numpy(tf.train.load_variable(tf_path, k))
    # if len(ts.shape)==2 and ts.shape[0] == ts.shape[1]:
    #     print(k, v)
    if v.endswith('##T'):
        torch_state_dict[v.rstrip('##T')] = ts.T
    else:
        torch_state_dict[v] = ts

torch.save(torch_state_dict, torch_path)

# config file
'''
# base version
{
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "num_attention_heads": 12,
  "attention_head_size": 64,
  "num_hidden_layers": 12,
  "vocab_size": 50000,
  "relative_attention_num_buckets": 32,
  "attention_scale": false,
  "is_dropout": true
}
# small version
{
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "num_attention_heads": 6,
  "attention_head_size": 64,
  "num_hidden_layers": 8,
  "vocab_size": 50000,
  "relative_attention_num_buckets": 32,
  "attention_scale": false,
  "is_dropout": true
}
'''
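One detail worth knowing when reusing this ##T convention: str.rstrip('##T') strips any trailing '#' and 'T' characters rather than the literal three-character suffix. That is harmless here because none of the target names end in '#' or an uppercase 'T', but a name that did would be silently truncated, so v[:-3] or v.replace('##T', '') is the safer general form. A two-line illustration (the second name is hypothetical):

# rstrip strips a *set* of characters, not a suffix
print('lm_head.weight##T'.rstrip('##T'))   # 'lm_head.weight' -- works for every key used above
print('proj.OUT##T'.rstrip('##T'))         # 'proj.OU' -- a hypothetical name ending in 'T' would be over-stripped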
bert/bert4torch_cmcc/examples/convert_script/convert_transformer_xl.py
0 → 100644
# Weights: https://huggingface.co/transfo-xl-wt103
# This checkpoint is English-only: it is only used to debug the transformer_xl model structure in bert4torch and was never actually used for finetuning
import torch

ckpt_file = 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/pytorch_model.bin'
torch_state_dict = {}

# map HuggingFace parameter names to bert4torch parameter names
key_map = {
    'transformer.word_emb.emb_layers.0.weight': 'embeddings.emb_layers.0.weight',
    'transformer.word_emb.emb_layers.1.weight': 'embeddings.emb_layers.1.weight',
    'transformer.word_emb.emb_layers.2.weight': 'embeddings.emb_layers.2.weight',
    'transformer.word_emb.emb_layers.3.weight': 'embeddings.emb_layers.3.weight',
    'transformer.word_emb.emb_projs.0': 'embeddings.emb_projs.0',
    'transformer.word_emb.emb_projs.1': 'embeddings.emb_projs.1',
    'transformer.word_emb.emb_projs.2': 'embeddings.emb_projs.2',
    'transformer.word_emb.emb_projs.3': 'embeddings.emb_projs.3',
}

for i in range(18):
    key_map.update({
        f'transformer.layers.{i}.dec_attn.r_r_bias': f'encoderLayer.{i}.multiHeadAttention.r_r_bias',
        f'transformer.layers.{i}.dec_attn.r_w_bias': f'encoderLayer.{i}.multiHeadAttention.r_w_bias',
        f'transformer.layers.{i}.dec_attn.o_net.weight': f'encoderLayer.{i}.multiHeadAttention.o.weight',
        f'transformer.layers.{i}.dec_attn.layer_norm.weight': f'encoderLayer.{i}.layerNorm1.weight',
        f'transformer.layers.{i}.dec_attn.layer_norm.bias': f'encoderLayer.{i}.layerNorm1.bias',
        f'transformer.layers.{i}.dec_attn.r_net.weight': f'encoderLayer.{i}.multiHeadAttention.r.weight',
        f'transformer.layers.{i}.pos_ff.CoreNet.0.weight': f'encoderLayer.{i}.feedForward.intermediateDense.weight',
        f'transformer.layers.{i}.pos_ff.CoreNet.0.bias': f'encoderLayer.{i}.feedForward.intermediateDense.bias',
        f'transformer.layers.{i}.pos_ff.CoreNet.3.weight': f'encoderLayer.{i}.feedForward.outputDense.weight',
        f'transformer.layers.{i}.pos_ff.CoreNet.3.bias': f'encoderLayer.{i}.feedForward.outputDense.bias',
        f'transformer.layers.{i}.pos_ff.layer_norm.weight': f'encoderLayer.{i}.layerNorm2.weight',
        f'transformer.layers.{i}.pos_ff.layer_norm.bias': f'encoderLayer.{i}.layerNorm2.bias',
    })

torch_weights = torch.load(ckpt_file)
model_new = {}
for key, value in key_map.items():
    model_new[value] = torch_weights[key]
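# Note: in the HuggingFace transfo-xl checkpoint the query/key/value projections of each layer are fused
# into a single `dec_attn.qkv_net.weight` of shape [3*d_model, d_model]; the loop below splits that matrix
# into the separate q/k/v weights that bert4torch's multiHeadAttention expects (chunk along dim=0).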
for i in range(18):
    qkv_net = torch_weights[f'transformer.layers.{i}.dec_attn.qkv_net.weight']
    model_new[f'encoderLayer.{i}.multiHeadAttention.q.weight'], model_new[f'encoderLayer.{i}.multiHeadAttention.k.weight'], \
        model_new[f'encoderLayer.{i}.multiHeadAttention.v.weight'] = qkv_net.chunk(3, dim=0)

torch.save(model_new, 'F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103/bert4torch_pytorch_model.bin')

# config file
'''
{
"adaptive": true,
"architectures": [
"TransfoXLLMHeadModel"
],
"attn_type": 0,
"clamp_len": 1000,
"cutoffs": [
20000,
40000,
200000
],
"d_embed": 1024,
"d_head": 64,
"intermediate_size": 4096,
"hidden_size": 1024,
"div_val": 4,
"is_dropout": true,
"adaptive_embedding": true,
"attention_probs_dropout_prob": 0.0,
"hidden_dropout_prob": 0.1,
"hidden_act": "relu",
"eos_token_id": 0,
"ext_len": 0,
"init": "normal",
"init_range": 0.01,
"init_std": 0.02,
"layer_norm_epsilon": 1e-05,
"mem_len": 1600,
"model_type": "transfo-xl",
"num_attention_heads": 16,
"num_hidden_layers": 18,
"pre_lnorm": false,
"proj_init_std": 0.01,
"same_length": true,
"sample_softmax": -1,
"task_specific_params": {
"text-generation": {
"do_sample": true,
"max_length": 250
}
},
"tgt_len": 128,
"tie_projs": [
false,
true,
true,
true
],
"tie_weight": true,
"untie_r": true,
"vocab_size": 267735
}
'''
\ No newline at end of file
bert/bert4torch_cmcc/examples/others/task_conditional_language_model.py
0 → 100644
#! -*- coding: utf-8 -*-
# BERT as a conditional language model
# Generate text at random conditioned on a class; in this demo the class is sentiment polarity (positive/negative)
# Reference: https://kexue.fm/archives/7124

from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate, Callback, AutoRegressiveDecoder, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn

# model configuration
maxlen = 128
batch_size = 16
num_classes = 2
epochs = 20

# bert configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load and simplify the vocab, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen characters."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
                # if len(D) >= 100:
                #     break
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append(label)

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids, batch_labels], batch_token_ids

# build the dataloader
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data',
                                         'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data',
                                         'F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        c = nn.Embedding(num_classes, 128)  # class embedding used as the conditional LayerNorm condition
        self.bert = build_transformer_model(config_path, checkpoint_path, with_mlm=True, application='lm',
                                            keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, simplifying the original vocab
                                            layer_norm_cond=c,
                                            ignore_invalid_weights=True)  # ignore weights that cannot be initialized from the checkpoint

    def forward(self, inputs):
        _, seq_output = self.bert(inputs)  # [btz, seq_len, vocab_size]
        return seq_output

model = Model().to(device)
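# How the conditioning works: `layer_norm_cond=c` switches BERT's LayerNorm layers to conditional
# LayerNorm, so the label embedding shifts/scales every LayerNorm output and steers generation towards
# the requested sentiment. The loss below is plain next-token prediction: logits at position t are
# compared with the token at position t+1, hence the [:, :-1] / [:, 1:] shift.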
class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, input, target):
        input = input[:, :-1, :].reshape(-1, input.shape[-1])
        target = target[:, 1:].flatten()
        return super().forward(input, target)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class RandomSentiment(AutoRegressiveDecoder):
    """Randomly generate a batch of sentences for a given sentiment label (0: negative, 1: positive)."""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = output_ids
        segment_ids = torch.zeros_like(token_ids, device=device)
        label = inputs[0]
        return model.predict([token_ids, segment_ids, label])[:, -1, :]

    def generate(self, label, n=1, topp=0.95):
        results = self.random_sample([[label]], n, topp=topp)  # random (top-p) sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in results]

random_sentiment = RandomSentiment(
    start_id=tokenizer._token_start_id,
    end_id=tokenizer._token_end_id,
    maxlen=maxlen,
    device=device
)

def just_show():
    print(u'正面采样:')
    print(random_sentiment.generate(1, 5, 0.95), '\n')
    print(u'负面采样:')
    print(random_sentiment.generate(0, 5, 0.95), '\n')

class Evaluator(Callback):
    """Evaluation and checkpointing."""
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # keep the best (lowest-loss) model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show a few samples
        just_show()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
"""
正面采样:
[
u'外观时尚、漂亮、性价比高。',
u'外观漂亮,配置均衡,比较满意,性价比高,外观漂亮,性能较高。',
u'我是在大学的时候看到这本书的,所以一直在买。书中的作者是林静蕾,她用自己的口吻写出了一个孩子成长中的心路历程,让我看到了她们成长中的不同之处,以及她们成长过程中的不同境界。让我很欣赏!',
u'我想这是一本能够告诉读者什么是坏的,而不是教你怎样说话,告诉我什么是错。这里我推荐了《我要讲故事》,这本书是我很喜欢的一本书,我认为它的理由很多,但是,我相信我。如果你从中得到一些改进,或者你已经有了一个明智的决定。',
u'我们一家五口住的是标间,大床房,大床的床很舒服;而我们在携程网上订了两套大床房,这个酒店的价格还是比较合理的;但是房间的隔音效果不太理想,有点响的声音;酒店门口的地铁在施工中,不方便;但是酒店的门口的出租车不知道是哪个车的,打车不是很方便;酒店外面的停'
]
负面采样:
[
u'不知道是不是因为电池不太好,不是我不喜欢。',
u'看了评论才买的. 结果发现不是那么便宜, 价格也不便宜.',
u'1、外壳不容易沾手印,不容易洗洗2、屏幕有点旧, 不能下载铃声',
u'我是7月6日订购了《杜拉拉升职记》并已通过银行付款,为什么订单下了两周多至今还未到货?是收货时间太快了,可能就这么过去了吧?',
u'这本书我是在网上先看了一遍,后来我再看了一遍。感觉作者的文笔实在太烂了,特别是在写他的博客时特别别扭,写得很不专业,特别是他写股票时那个情绪调节的小男孩,简直就是自作聪明的样子,简直就是自作聪明的一种表现!'
]
"""
bert/bert4torch_cmcc/examples/others/task_iflytek_bert_of_theseus.py
0 → 100644
#! -*- coding:utf-8 -*-
# Model compression on a text classification task
# Method: BERT-of-Theseus
# Paper: https://arxiv.org/abs/2002.02925
# Blog: https://kexue.fm/archives/7575

import json
from bert4torch.models import build_transformer_model, BaseModel, BERT
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.tokenizers import Tokenizer
from bert4torch.layers import BertLayer
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torchinfo import summary
import copy
from torch.distributions.bernoulli import Bernoulli

num_classes = 119
maxlen = 128
batch_size = 32
replacing_rate = 0.5
steps_for_replacing = 2000

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Single-sample format: (text, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for i, l in enumerate(f):
                l = json.loads(l)
                text, label = l['sentence'], l['label']
                D.append((text, int(label)))
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/CLUEdataset/iflytek/dev.json'),
                              batch_size=batch_size, collate_fn=collate_fn)
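# BERT-of-Theseus in brief: alongside the 12 pretrained "predecessor" layers (encoderLayer) a 6-layer
# "successor" stack (scc_layer) is created. During training each successor layer randomly replaces its
# two corresponding predecessor layers with probability replacing_rate (one Bernoulli draw per module),
# so the small stack gradually learns to stand in for the large one; at inference only scc_layer is used.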
class BERT_THESEUS(BERT):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        layer = BertLayer(self.hidden_size, self.num_attention_heads, self.dropout_rate, self.attention_probs_dropout_prob,
                          self.intermediate_size, self.hidden_act, is_dropout=False, conditional_size=self.conditional_size)
        self.encoderLayer = nn.ModuleList(nn.ModuleList([copy.deepcopy(layer) for _ in range(self.num_hidden_layers)]))
        self.scc_n_layer = 6  # distill down to 6 layers
        self.scc_layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(self.scc_n_layer)])
        self.compress_ratio = self.num_hidden_layers // self.scc_n_layer
        self.bernoulli = None

    def set_replacing_rate(self, replacing_rate):
        if not 0 < replacing_rate <= 1:
            raise Exception('Replace rate must be in the range (0, 1]!')
        self.bernoulli = Bernoulli(torch.tensor([replacing_rate]))

    def apply_main_layers(self, inputs):
        """The BERT body is a stack of self-attention modules.
        Order: Att --> Add --> LN --> FFN --> Add --> LN
        """
        hidden_states, attention_mask, conditional_emb = inputs
        encoded_layers = [hidden_states]  # also keep the embedding output
        if self.training:
            inference_layers = []
            for i in range(self.scc_n_layer):
                if self.bernoulli.sample() == 1:  # REPLACE with the successor layer
                    inference_layers.append(self.scc_layer[i])
                else:  # KEEP the original predecessor layers
                    for offset in range(self.compress_ratio):
                        inference_layers.append(self.encoderLayer[i * self.compress_ratio + offset])
        else:  # inference with the compressed model
            inference_layers = self.scc_layer

        # forward
        for i, layer_module in enumerate(inference_layers):
            hidden_states = layer_module(hidden_states, attention_mask, conditional_emb)
            if self.output_all_encoded_layers:
                encoded_layers.append(hidden_states)
        if not self.output_all_encoded_layers:
            encoded_layers.append(hidden_states)
        return [encoded_layers, conditional_emb]

# model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=BERT_THESEUS)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], num_classes)

    def forward(self, token_ids, segment_ids):
        encoded_layers = self.bert([token_ids, segment_ids])
        output = self.dense(encoded_layers[:, 0, :])  # take the first ([CLS]) position
        return output

model = Model().to(device)
summary(model, input_data=next(iter(train_dataloader))[0])
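# Replacement scheduling: ConstantReplacementScheduler keeps the Bernoulli replacing_rate fixed and then
# jumps to 1.0 (successor-only training) after `replacing_steps` steps; LinearReplacementScheduler instead
# ramps the rate as min(k * step + base_replacing_rate, 1.0). This script uses the constant variant with
# replacing_rate=0.5 and steps_for_replacing=2000.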
# replacement strategies
class ConstantReplacementScheduler:
    def __init__(self, bert_encoder, replacing_rate, replacing_steps=None):
        self.bert_encoder = bert_encoder
        self.replacing_rate = replacing_rate
        self.replacing_steps = replacing_steps
        self.step_counter = 0
        self.bert_encoder.set_replacing_rate(replacing_rate)

    def step(self):
        self.step_counter += 1
        if self.replacing_steps is None or self.replacing_rate == 1.0:
            return self.replacing_rate
        else:
            if self.step_counter >= self.replacing_steps:
                self.bert_encoder.set_replacing_rate(1.0)
                self.replacing_rate = 1.0
            return self.replacing_rate

class LinearReplacementScheduler:
    def __init__(self, bert_encoder, base_replacing_rate, k):
        self.bert_encoder = bert_encoder
        self.base_replacing_rate = base_replacing_rate
        self.step_counter = 0
        self.k = k
        self.bert_encoder.set_replacing_rate(base_replacing_rate)

    def step(self):
        self.step_counter += 1
        current_replacing_rate = min(self.k * self.step_counter + self.base_replacing_rate, 1.0)
        self.bert_encoder.set_replacing_rate(current_replacing_rate)
        return current_replacing_rate

replacing_rate_scheduler = ConstantReplacementScheduler(bert_encoder=model.bert, replacing_rate=replacing_rate,
                                                        replacing_steps=steps_for_replacing)

model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5),
              scheduler=replacing_rate_scheduler, metrics=['accuracy'])

def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total

class Evaluator(Callback):
    """Evaluation and checkpointing."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(u'val_acc: %.5f, best_val_acc: %.5f\n' % (val_acc, self.best_val_acc))

def predict_to_file(in_file, out_file):
    """Write predictions to a file.
    The result file can be submitted to https://www.cluebenchmarks.com for evaluation.
    """
    fw = open(out_file, 'w')
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            text = l['sentence']
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            label = model.predict([[token_ids], [segment_ids]])[0].argmax()
            l = json.dumps({'id': str(l['id']), 'label': str(label)})
            fw.write(l + '\n')
    fw.close()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=50, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
    # predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json')
bert/bert4torch_cmcc/examples/others/task_language_model.py
0 → 100644
#! -*- coding: utf-8 -*-
# BERT as a language model, used here for novel (fiction) generation

import glob, re
from tqdm import tqdm
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, Callback, ListDataset
import torch
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim

maxlen = 256
batch_size = 8
epochs = 10000

# bert configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load and simplify the vocab, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the corpus and split it into pieces of at most maxlen characters."""
        novels = []
        for txt in glob.glob(filenames):
            txt = open(txt, encoding='utf-8').read()
            txt = txt.replace('\r', '').replace('\n', '')
            txt = txt.replace(u'整理制作,并提供下载', '')
            txt = re.sub(u'www.*?com', '', txt)
            txt = txt.replace(u'\u3000', ' ')
            sents = []
            for t in txt.split(' '):
                for s in re.findall(u'.*?。', t):
                    if len(s) <= maxlen - 2:
                        sents.append(s)
            novels.append(sents)

        data = []
        pbar = tqdm(desc=u'构建语料中', total=sum(len(n) for n in novels))
        for novel in novels:
            s = u''
            for i in range(len(novel)):
                for j in range(len(novel) - i):
                    if len(s) + len(novel[i + j]) > maxlen - 2:
                        data.append(s)
                        s = u''
                        break
                    else:
                        s += novel[i + j]
                pbar.update(1)
                if i + j >= len(novel):
                    break
            if s:
                data.append(s)
        pbar.close()
        return data

def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    for text in batch:
        token_ids, segment_ids = tokenizer.encode(text)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_token_ids

# load the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/pretrain/金庸小说/*.txt'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
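# With application='lm', build_transformer_model applies a lower-triangular attention mask, turning BERT
# into an autoregressive LM over the simplified vocab; the MLM head serves as the LM output layer, which is
# why the loss below compares mlm_scores[:, :-1] against the input tokens shifted left by one position.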
# build the model
model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='lm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, simplifying the original vocab
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        _, mlm_scores = outputs
        mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
        target = target[:, 1:].flatten()
        return super().forward(mlm_scores, target)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

# random sampling
class StoryCompletion(AutoRegressiveDecoder):
    """Story continuation based on random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = inputs[0]
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.zeros_like(token_ids, device=device)
        _, mlm_scores = model.predict([token_ids, segment_ids])
        return mlm_scores[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids[:-1]], n, topp=topp)  # random (top-p) sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]

story_completion = StoryCompletion(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def just_show():
    s1 = u'当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。'
    s2 = u'虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。'
    s3 = u'杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。'
    for s in [s1, s2, s3]:
        t = story_completion.generate(s)
        print(u'输入: %s' % s)
        print(u'结果: %s\n' % ('\n'.join(t)))

class Evaluator(Callback):
    """Evaluation and checkpointing."""
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # keep the best (lowest-loss) model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show a few samples
        just_show()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=epochs, steps_per_epoch=100, callbacks=[evaluator])
else:
    model.load_weights('./best_model.weights')
"""
效果:
输入: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。
结果: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。次日清晨,张无忌便和赵敏去买了一匹高头大马,自己骑了随伴。那马甚有神骏,三十六斤重的身躯之中,竟无一头白马。他心中怦怦乱跳,暗想:若能将赵敏引出迷城,我决不致再和她相会,但若和赵姑娘相遇,我一生一世决计再难相见。何况我是她的私生女儿,这般亲热,岂不是好?我如何能和她相见?今后我要教训教训她才好?我教教她,教训她,要她心里快快活活的。他心如刀割,当即回到客店,将张无忌的所在说了。
输入: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。
结果: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。那矮子见他如此功力,大吃一惊,叫道:什么人?是谁?你干什么?我师父是谁?你们是谁?是谁?你们是谁?我师父是谁?你这矮子,便是段延庆。你们不知道我师父便是,是不是?快快说来。那矮子道:我师父便是延庆太子,他的徒弟也是段延庆。他老人家在唐朝做镇南王,你们便将他改名为延庆太子,叫做延庆太子!这名头倒怪,你们大伙儿听见了,也不知道他老人家是死是活。
输入: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。
结果: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。这时见他手中所握,竟是一柄特制的短剑,心中大喜,叫道::原来是金蛇郎君的剑!原来你便是金蛇郎君的弟子,这一下可要叫我失望了。那人哈哈一笑,说道:好啊!好啊,好啊!我的金蛇剑是我的,不过我是你的。这人道:我姓杨名过,名字叫过。你是我儿子,是我女儿,是不是?你这么大的年纪,怎地自称金刀驸马?我这就给你取个名字,叫作过儿。
"""
bert/bert4torch_cmcc/examples/others/task_language_model_chinese_chess.py
0 → 100644
#! -*- coding: utf-8 -*-
# Train a Chinese chess (xiangqi) model by language-model supervision on game records
# Introduction: https://kexue.fm/archives/7877
# Data: https://github.com/bojone/gpt_cchess
# Training works under python2/python3, but the cchess module only supports python3,
# so interactive play against the model requires python3.
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_roberta_chess.py

import json
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from cchess import *

# basic settings
maxlen = 512
steps_per_epoch = 1000
epochs = 10000
batch_size = 16

# bert configuration
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the full game records.
        Returns: [(record, result)], where result 2 means red wins, 1 a draw,
        0 black wins, and -1 means no explicit result was recorded.
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                if not l['fen']:
                    result = int(l['items'].get(u'棋局结果', -1))
                    D.append((l['iccs'], result))
        return D

# build the tokenizer: moves are written in ICCS coordinates (digits 0-9 and columns a-i)
chars = [u'[PAD]'] + list(u'0123456789abcdefghi')
token_dict = dict(zip(chars, range(len(chars))))
tokenizer = Tokenizer(token_dict)
tokenizer._token_unk_id = 0

bert_token_dict = load_vocab(dict_path)
keep_tokens = [bert_token_dict[c] for c in chars]

count = 0
def get_count():
    # length curriculum: shorter game prefixes early in training, full-length games later
    if count < 20000:
        n = 8
    elif count < 40000:
        n = 4
    elif count < 80000:
        n = 2
    else:
        n = 1
    return n

def collate_fn(batch):
    """Batch generator."""
    batch_token_ids, batch_segment_ids = [], []
    for text, _ in batch:
        token_ids, segment_ids = tokenizer.encode(' '.join(text), maxlen=maxlen // get_count() + 1)
        batch_token_ids.append([0] + token_ids[1:-1])
        batch_segment_ids.append([0] + segment_ids[1:-1])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    global count
    count += 1
    return [batch_token_ids, batch_segment_ids], batch_token_ids

# load the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/qipu/qipu.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Since id 0 in this vocab is not a padding position, token_pad_ids=-100 avoids a wrong attention_mask
model = build_transformer_model(config_path, checkpoint_path, application='lm', with_mlm=True,
                                keep_tokens=keep_tokens, token_pad_ids=-100).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        _, mlm_scores = outputs
        mlm_scores = mlm_scores[:, :-1, :].reshape(-1, mlm_scores.shape[-1])
        target = target[:, 1:].flatten()
        return super().forward(mlm_scores, target)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))
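# How the machine picks a move (see ChessPlayer.new_game below): every legal next move is appended to the
# move history, each resulting string is fed through the LM, and the sum of log-probabilities of the 4 ICCS
# characters of the candidate move is used as its score; the highest-scoring candidate is played.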
class ChessPlayer(object):
    """Interactive chess-playing program."""
    def move_to_chinese(self, move):
        """Convert a single move to its Chinese description."""
        if not isinstance(move, Move):
            move = Move(self.board, move[0], move[1])
        return move.to_chinese()

    def move_to_iccs(self, move):
        """Convert a single move to ICCS notation."""
        if not isinstance(move, Move):
            move = Move(self.board, move[0], move[1])
        return move.to_iccs()

    def print_board(self):
        """Print the current board.
        For readability, red pieces are shown in red and black pieces in green.
        """
        for l in self.board.dump_board():
            for c in u'兵炮车马相仕帅':
                l = l.replace(c, u'\033[1;31;40m%s\033[0m' % c)
            for c in u'卒砲砗碼象士将':
                l = l.replace(c, u'\033[1;32;40m%s\033[0m' % c)
            print(l)

    def movable_steps(self):
        """List all candidate moves for the current position."""
        return [self.move_to_iccs(m) for m in self.board.create_moves()]

    def human_input(self):
        """Human move."""
        while True:
            try:
                iccs = input(u'请输入iccs棋着: ')  # prompt: enter an ICCS move
                print(iccs)
                move = self.board.move_iccs(iccs)
                if move is not None:
                    return iccs, move
            except KeyboardInterrupt:
                return None
            except:
                pass

    def record(self, iccs):
        """Advance the game by one move."""
        self.history += iccs
        self.board.next_turn()
        self.print_board()
        self.current = (self.current + 1) % 2

    def new_game(self, current=0):
        """Start a new game."""
        self.board = ChessBoard()
        self.board.from_fen(FULL_INIT_FEN)
        self.print_board()
        self.history = ''
        self.current = current
        if self.current == 0:  # human moves first
            iccs, move = self.human_input()
            self.record(iccs)
        while True:
            # machine move
            moves = self.movable_steps()
            iccses = [' '.join(self.history + m) for m in moves]
            token_ids = [[0] + tokenizer.encode(ic)[0][1:-1] for ic in iccses]
            token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)
            segment_ids = torch.zeros_like(token_ids)
            preds = model.predict([token_ids, segment_ids])[-1][:, -5:-1]
            preds = nn.Softmax(dim=-1)(preds)
            preds = torch.take_along_dim(preds, token_ids[:, -4:, None], dim=2)
            preds = torch.log(preds + 1e-8)[:, :, 0].sum(dim=1)
            iccs = moves[preds.argmax()]
            move = self.board.move_iccs(iccs)
            self.record(iccs)
            if self.board.is_win():
                print(u'机器赢了')  # the machine wins
                break
            # human move
            iccs, move = self.human_input()
            self.record(iccs)
            if self.board.is_win():
                print(u'人类赢了')  # the human wins
                break

chessplayer = ChessPlayer()

class Evaluator(Callback):
    """Evaluation and checkpointing."""
    def on_epoch_end(self, global_step, epoch, logs=None):
        # save the model
        # model.save_weights('./best_model_chess.pt')
        pass

if __name__ == '__main__':
    choice = 'eval'
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, steps_per_epoch=1000, epochs=20, callbacks=[evaluator])
    else:
        model.load_weights('./best_model_chess.pt')
        chessplayer.new_game(0)  # start a new game: 0 = human moves first, 1 = machine moves first
bert/bert4torch_cmcc/examples/others/task_nl2sql_baseline.py
0 → 100644
#! -*- coding: utf-8 -*-
# A baseline for the 2019 Zhuiyi Technology NL2SQL challenge (personal work, not an official release, BERT-based)
# Competition: https://tianchi.aliyun.com/competition/entrance/231716/introduction
# Kexue.fm post: https://kexue.fm/archives/6771
# Su's original result is around 58%; this reproduction reaches 58.39%
# Idea: [CLS] question [SEP] [CLS] col1 [SEP] [CLS] col2 [SEP]
# The sentence-level [CLS] predicts the conds connector: {0:"", 1:"and", 2:"or"}
# Each column's [CLS] predicts whether the column is selected and with which aggregation: {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"not selected"}
''' Sample record
{
    "table_id": "a1b2c3d4",                                   # id of the corresponding table
    "question": "世茂茂悦府新盘容积率大于1,请问它的套均面积是多少?",  # natural-language question
    "sql":{                                                   # ground-truth SQL
        "sel": [7],                                           # columns selected by the SQL
        "agg": [0],                                           # aggregation function for each selected column, '0' means none
        "cond_conn_op": 0,                                    # relation between conditions
        "conds": [
            [1, 2, "世茂茂悦府"],                               # condition column, condition type, condition value: col_1 == "世茂茂悦府"
            [6, 0, "1"]
        ]
    }
}
'''
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback
from bert4torch.optimizers import get_linear_schedule_with_warmup
import json
import codecs
import numpy as np
from tqdm import tqdm
import jieba
import editdistance
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn, optim
import re

batch_size = 16
maxlen = 160
num_agg = 7  # agg_sql_dict = {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"not selected"}
num_op = 5  # {0:">", 1:"<", 2:"==", 3:"!=", 4:"not selected"}
num_cond_conn_op = 3  # conn_sql_dict = {0:"", 1:"and", 2:"or"}
learning_rate = 2.5e-5
epochs = 15
device = 'cuda' if torch.cuda.is_available() else 'cpu'

config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'

def read_data(data_file, table_file):
    data, tables = [], {}
    with open(data_file, 'r', encoding='utf-8') as f:
        for l in f:
            data.append(json.loads(l))
    with open(table_file, 'r', encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            d = {}
            d['headers'] = l['header']
            d['header2id'] = {j: i for i, j in enumerate(d['headers'])}
            d['content'] = {}
            d['all_values'] = set()
            rows = np.array(l['rows'])
            for i, h in enumerate(d['headers']):
                d['content'][h] = set(rows[:, i])
                d['all_values'].update(d['content'][h])
            d['all_values'] = set([i for i in d['all_values'] if hasattr(i, '__len__')])
            tables[l['id']] = d
    return data, tables

token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent spaces with the untrained [unused1] token
            else:
                R.append('[UNK]')  # everything else becomes [UNK]
        return R

tokenizer = OurTokenizer(token_dict)

def most_similar(s, slist):
    """Find the closest string in slist (used when no exact match exists)."""
    if len(slist) == 0:
        return s
    scores = [editdistance.eval(s, t) for t in slist]
    return slist[np.argmin(scores)]

def most_similar_2(w, s):
    """Find the fragment of sentence s closest to w,
    using word segmentation and n-grams to pin down the span boundary as precisely as possible.
    """
    sw = jieba.lcut(s)
    sl = list(sw)
    sl.extend([''.join(i) for i in zip(sw, sw[1:])])
    sl.extend([''.join(i) for i in zip(sw, sw[1:], sw[2:])])
    return most_similar(w, sl)
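# Annotated condition values do not always appear verbatim in the question, so most_similar_2 builds
# candidate spans from jieba tokens plus their 2-gram and 3-gram concatenations and picks the candidate
# with the smallest edit distance to the annotated value; that span is then tagged in csel/cop below.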
class MyDataset(Dataset):
    def __init__(self, data, tables):
        self.data = data
        self.tables = tables

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        d = self.data[i]
        # [CLS] question [SEP] [CLS] col1 [SEP] [CLS] col2 [SEP]
        x1 = tokenizer.encode(d['question'])[0]
        xm = [0] + [1] * len(d['question']) + [0]
        h = []
        for j in self.tables[d['table_id']]['headers']:
            _x1 = tokenizer.encode(j)[0]
            h.append(len(x1))
            x1.extend(_x1)
        if len(x1) > maxlen:
            return
        hm = [1] * len(h)  # column mask

        # whether each column is selected
        sel = []
        for j in range(len(h)):
            if j in d['sql']['sel']:
                j = d['sql']['sel'].index(j)
                sel.append(d['sql']['agg'][j])
            else:
                sel.append(num_agg - 1)  # columns not selected are labelled num_agg-1

        conn = [d['sql']['cond_conn_op']]
        csel = np.zeros(len(d['question']) + 2, dtype='int32')  # 0 means both padding and the first column; padding positions are masked out during training
        cop = np.zeros(len(d['question']) + 2, dtype='int32') + num_op - 1  # positions not selected are labelled num_op-1
        for j in d['sql']['conds']:
            if j[2] not in d['question']:
                j[2] = most_similar_2(j[2], d['question'])
            if j[2] not in d['question']:
                continue
            k = d['question'].index(j[2])
            csel[k + 1: k + 1 + len(j[2])] = j[0]
            cop[k + 1: k + 1 + len(j[2])] = j[1]

        # x1:   bert input ids [101, 123, 121, 122, 123, 2399, 122, 118, 126, 3299, 5168, 6369, 2832, 6598, ...]
        # xm:   bert input mask [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]
        # h:    positions of column [CLS] tokens [56, 60, 74, 89, 104, 114, 123, 132]
        # hm:   column mask [1, 1, 1, 1, 1, 1, 1, 1]
        # sel:  selected columns [4, 6, 6, 6, 6, 6, 6, 6], 6 means not selected, 4 means COUNT
        # conn: connector type [1], 1 means and
        # csel: condition columns [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        # cop:  condition operators (also marks the value span) [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
        return x1, xm, h, hm, sel, conn, csel, cop

def collate_fn(batch):
    x1, xm, h, hm, sel, conn, csel, cop = zip(*[i for i in batch if i])
    x1 = torch.tensor(sequence_padding(x1), dtype=torch.long, device=device)
    xm = torch.tensor(sequence_padding(xm, length=x1.shape[1]), dtype=torch.long, device=device)
    h = torch.tensor(sequence_padding(h), dtype=torch.long, device=device)
    hm = torch.tensor(sequence_padding(hm), dtype=torch.long, device=device)
    sel = torch.tensor(sequence_padding(sel), dtype=torch.long, device=device)
    conn = torch.tensor(sequence_padding(conn), dtype=torch.long, device=device)
    csel = torch.tensor(sequence_padding(csel, length=x1.shape[1]), dtype=torch.long, device=device)
    cop = torch.tensor(sequence_padding(cop, length=x1.shape[1]), dtype=torch.long, device=device)
    return [x1, h, hm], [sel, conn, csel, cop, xm, hm]

datadir = 'F:/Projects/data/corpus/other/ZhuiyiTechnology_NL2SQL'
train_dataloader = DataLoader(MyDataset(*read_data(f'{datadir}/train/train.json', f'{datadir}/train/train.tables.json')),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_data, valid_table = read_data(f'{datadir}/val/val.json', f'{datadir}/val/val.tables.json')
test_data, test_table = read_data(f'{datadir}/test/test.json', f'{datadir}/test/test.tables.json')

class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        hidden_size = self.bert.configs['hidden_size']
        self.conn = nn.Linear(hidden_size, num_cond_conn_op)
        self.agg = nn.Linear(hidden_size, num_agg)
        self.op = nn.Linear(hidden_size, num_op)
        self.dense1 = nn.Linear(hidden_size, 256)
        self.dense2 = nn.Linear(hidden_size, 256)
        self.dense3 = nn.Linear(256, 1)

    def forward(self, x1_in, h, hm):
        x = self.bert([x1_in])

        # the sentence [CLS] predicts the condition connector {0:"", 1:"and", 2:"or"}
        x4conn = x[:, 0]  # [CLS] position
        pconn = self.conn(x4conn)  # [btz, num_cond_conn_op]

        # each column's [CLS] predicts whether the column is selected and its aggregation {0:"", 1:"AVG", 2:"MAX", 3:"MIN", 4:"COUNT", 5:"SUM", 6:"not selected"}
        x4h = torch.gather(x, dim=1, index=h.unsqueeze(-1).expand(-1, -1, 768))  # [btz, col_len, hdsz]
        psel = self.agg(x4h)  # [btz, col_len, num_agg]

        # sequence labelling of condition values and operators
        pcop = self.op(x)  # [btz, seq_len, num_op]
        x = x.unsqueeze(2)  # [btz, seq_len, 1, hdsz]
        x4h = x4h.unsqueeze(1)  # [btz, 1, col_len, hdsz]

        pcsel_1 = self.dense1(x)  # [btz, seq_len, 1, 256]
        pcsel_2 = self.dense2(x4h)  # [btz, 1, col_len, 256]
        pcsel = pcsel_1 + pcsel_2
        pcsel = torch.tanh(pcsel)
        pcsel = self.dense3(pcsel)  # [btz, seq_len, col_len, 1]
        pcsel = pcsel[..., 0] - (1 - hm[:, None]) * 1e10  # [btz, seq_len, col_len]
        return pconn, psel, pcop, pcsel

model = Model().to(device)
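# pcsel is effectively additive attention between question-token states and column-[CLS] states: dense1 and
# dense2 project both into a shared 256-dim space, the projections are summed and squashed with tanh, and
# dense3 scores every (token, column) pair; padded columns are pushed towards -1e10 via hm so that the
# subsequent argmax ignores them.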
class MyLoss(nn.Module):
    def forward(self, outputs, labels):
        pconn, psel, pcop, pcsel = outputs
        sel_in, conn_in, csel_in, cop_in, xm, hm = labels
        cm = torch.not_equal(cop_in, num_op - 1)  # mask of positions belonging to some condition value
        batch_size = psel.shape[0]

        psel_loss = F.cross_entropy(psel.view(-1, num_agg), sel_in.view(-1), reduction='none').reshape(batch_size, -1)
        psel_loss = torch.sum(psel_loss * hm) / torch.sum(hm)
        pconn_loss = F.cross_entropy(pconn, conn_in.view(-1))
        pcop_loss = F.cross_entropy(pcop.view(-1, num_op), cop_in.view(-1), reduction='none').reshape(batch_size, -1)
        pcop_loss = torch.sum(pcop_loss * xm) / torch.sum(xm)
        pcsel_loss = F.cross_entropy(pcsel.view(-1, pcsel.shape[-1]), csel_in.view(-1), reduction='none').reshape(batch_size, -1)
        pcsel_loss = torch.sum(pcsel_loss * xm * cm) / torch.sum(xm * cm)
        loss = psel_loss + pconn_loss + pcop_loss + pcsel_loss
        return {'loss': loss, 'psel_loss': psel_loss, 'pconn_loss': pconn_loss, 'pcop_loss': pcop_loss, 'pcsel_loss': pcsel_loss}

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, len(train_dataloader), len(train_dataloader) * epochs)
model.compile(loss=MyLoss(), optimizer=optimizer, scheduler=scheduler)
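# Decoding (nl2sql below): the per-token operator tags pcop are scanned left to right to cut out condition
# value spans; for each span the column is chosen by averaging pcsel over the span and taking the argmax.
# Non-numeric values are forced to the '==' operator and snapped to the closest cell value in the table.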
def nl2sql(question, table):
    """Convert a question plus table headers into SQL."""
    x1 = tokenizer.encode(question)[0]
    h = []
    for i in table['headers']:
        _x1 = tokenizer.encode(i)[0]
        h.append(len(x1))
        x1.extend(_x1)
    hm = [1] * len(h)
    pconn, psel, pcop, pcsel = model.predict([
        torch.tensor([x1], dtype=torch.long, device=device),
        torch.tensor([h], dtype=torch.long, device=device),
        torch.tensor([hm], dtype=torch.long, device=device)
    ])
    pconn, psel, pcop, pcsel = pconn.cpu().numpy(), psel.cpu().numpy(), pcop.cpu().numpy(), pcsel.cpu().numpy()

    R = {'agg': [], 'sel': []}
    for i, j in enumerate(psel[0].argmax(1)):
        if j != num_agg - 1:  # class num_agg-1 means "not selected"
            R['sel'].append(i)
            R['agg'].append(int(j))

    conds = []
    v_op = -1
    for i, j in enumerate(pcop[0, :len(question) + 1].argmax(1)):
        # combine the tagging and classification outputs to predict the conditions
        if j != num_op - 1:
            if v_op != j:
                if v_op != -1:
                    v_end = v_start + len(v_str)
                    csel = pcsel[0][v_start: v_end].mean(0).argmax()
                    conds.append((csel, v_op, v_str))
                v_start = i
                v_op = j
                v_str = question[i - 1]
            else:
                v_str += question[i - 1]
        elif v_op != -1:
            v_end = v_start + len(v_str)
            csel = pcsel[0][v_start: v_end].mean(0).argmax()
            conds.append((csel, v_op, v_str))
            v_op = -1

    R['conds'] = set()
    for i, j, k in conds:
        if re.findall('[^\d\.]', k):
            j = 2  # non-numeric values can only use the equality operator
        if j == 2:
            if k not in table['all_values']:
                # an equality value must occur in the table; otherwise pick the closest existing one
                k = most_similar(k, list(table['all_values']))
            h = table['headers'][i]
            # then check whether the value really belongs to the predicted column; if not, fix the column
            if k not in table['content'][h]:
                for r, v in table['content'].items():
                    if k in v:
                        i = table['header2id'][r]
                        break
        R['conds'].add((int(i), int(j), str(k)))
    R['conds'] = list(R['conds'])

    if len(R['conds']) <= 1:  # with at most one condition the connector is simply 0
        R['cond_conn_op'] = 0
    else:
        R['cond_conn_op'] = 1 + int(pconn[0, 1:].argmax())  # cannot be 0
    return R

def is_equal(R1, R2):
    """Check whether two SQL dicts match exactly."""
    return (R1['cond_conn_op'] == R2['cond_conn_op']) & \
           (set(zip(R1['sel'], R1['agg'])) == set(zip(R2['sel'], R2['agg']))) & \
           (set([tuple(i) for i in R1['conds']]) == set([tuple(i) for i in R2['conds']]))

class Evaluate(Callback):
    def __init__(self):
        self.accs = []
        self.best = 0.
        self.passed = 0
        self.stage = 0

    def on_epoch_end(self, global_step, epoch, logs=None):
        acc = self.evaluate(valid_data, valid_table)
        self.accs.append(acc)
        if acc > self.best:
            self.best = acc
            # model.save_weights('best_model.weights')
        print('acc: %.5f, best acc: %.5f\n' % (acc, self.best))

    def evaluate(self, data, tables):
        right = 0.
        pbar = tqdm()
        F = open('evaluate_pred.json', 'w', encoding='utf-8')
        for i, d in enumerate(data):
            question = d['question']
            table = tables[d['table_id']]
            R = nl2sql(question, table)
            right += float(is_equal(R, d['sql']))
            pbar.update(1)
            pbar.set_description('< acc: %.5f >' % (right / (i + 1)))
            d['sql_pred'] = R
            try:
                s = json.dumps(d, ensure_ascii=False, indent=4)
            except:
                continue
            F.write(s + '\n')
        F.close()
        pbar.close()
        return right / len(data)

    def test(self, data, tables, outfile='result.json'):
        pbar = tqdm()
        F = open(outfile, 'w')
        for i, d in enumerate(data):
            question = d['question']
            table = tables[d['table_id']]
            R = nl2sql(question, table)
            pbar.update(1)
            s = json.dumps(R, ensure_ascii=False)
            F.write(s + '\n')
        F.close()
        pbar.close()

if __name__ == '__main__':
    evaluator = Evaluate()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('best_model.weights')
\ No newline at end of file
bert/bert4torch_cmcc/examples/pretrain/roberta_pretrain/pretrain_roberta_mlm.py
0 → 100644
#! -*- coding: utf-8 -*-
# Pretraining script, single-GPU version for easy testing
# Switching to DDP only takes a few lines, see https://github.com/Tongjilibo/bert4torch/blob/master/examples/training_trick/task_distributed_data_parallel.py

from bert4torch.models import build_transformer_model
from bert4torch.snippets import sequence_padding, Callback
from bert4torch.optimizers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import json
import os
import shelve
import random
import time

# corpus path and model save path
model_saved_path = './bert_model.ckpt'
dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain'  # dir_training_data
task_name = 'roberta'

# other settings
maxlen = 512
batch_size = 7
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'  # set to None to train from scratch
learning_rate = 0.00176
weight_decay_rate = 0.01  # weight decay
num_warmup_steps = 3125
num_train_steps = 125000
steps_per_epoch = 10000
grad_accum_steps = 16  # values > 1 enable gradient accumulation
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# read the dataset and build tensors
class MyDataset(Dataset):
    def __init__(self, file):
        super(MyDataset, self).__init__()
        self.file = file
        self.len = self._get_dataset_length()
        self.db = self._load_data()

    def __getitem__(self, index):
        return self.db[str(index)]

    def __len__(self):
        return self.len

    def _get_dataset_length(self):
        file_record_info = self.file + ".json"
        record_info = json.load(open(file_record_info, "r", encoding="utf-8"))
        return record_info["samples_num"]

    def _load_data(self):
        return shelve.open(self.file)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for item in batch:
        batch_token_ids.append(item['input_ids'])
        batch_labels.append(item['masked_lm_labels'])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids], batch_labels

# pick a random file from the corpus folder and build a dataloader from it
def get_train_dataloader():
    while True:
        # prepare dataset
        files_training_data = os.listdir(dir_training_data)
        files_training_data = [file.split(".")[0] for file in files_training_data if "train" in file]
        # avoid picking a file that is still being generated
        files_training_data = [i for i in set(files_training_data) if files_training_data.count(i) == 4]
        if files_training_data:
            file_train = random.choice(files_training_data)
            for suffix in [".bak", ".dat", ".dir", ".json"]:
                file_old = os.path.join(dir_training_data, file_train + suffix)
                file_new = os.path.join(dir_training_data, task_name + suffix)
                os.renames(file_old, file_new)
            cur_load_file = file_new.split(".")[0]
            train_dataloader = DataLoader(MyDataset(cur_load_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
            break
        else:
            print("No training data! Sleep 300s!")
            time.sleep(10)
            continue
    return train_dataloader

train_dataloader = get_train_dataloader()

model = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_mlm=True).to(device)
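# Weight-decay grouping: biases and LayerNorm parameters are excluded from weight decay (a standard choice
# in BERT pretraining); everything else uses weight_decay_rate. The two parameter groups are then handed to
# Adam below.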
# weight decay
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay_rate},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

class MyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, output, batch_labels):
        y_preds = output[-1]
        y_preds = y_preds.reshape(-1, y_preds.shape[-1])
        return super().forward(y_preds, batch_labels.flatten())

# define the loss and optimizer; custom ones are supported
optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate, weight_decay=weight_decay_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
model.compile(loss=MyLoss(ignore_index=0), optimizer=optimizer, scheduler=scheduler)

class ModelCheckpoint(Callback):
    """Automatically save the latest model."""
    def on_dataloader_end(self, logs=None):
        # when the dataloader is exhausted, close the db and delete the consumed training files
        model.train_dataloader.dataset.db.close()
        for suffix in [".bak", ".dat", ".dir", ".json"]:
            file_remove = os.path.join(dir_training_data, task_name + suffix)
            try:
                os.remove(file_remove)
            except:
                print(f"Failed to remove training data {file_remove}.")

        # rebuild the dataloader from the next available file
        model.train_dataloader = get_train_dataloader()

    def on_epoch_end(self, global_step, epoch, logs=None):
        model.save_weights(model_saved_path)

if __name__ == '__main__':
    # save the model
    checkpoint = ModelCheckpoint()
    # train the model
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, grad_accumulation_steps=grad_accum_steps, epochs=epochs, callbacks=[checkpoint])
bert/bert4torch_cmcc/examples/pretrain/roberta_pretrain/pretrain_roberta_mlm_data_gen.py
0 → 100644
#! -*- coding: utf-8 -*-
# Pretraining corpus construction; only the MLM task is implemented here, NSP and SOP are not used
# Scheme: keep generating files dynamically and sleep once the maximum number of saved files is reached.
# If training consumes files faster than they are generated, several copies of this script can run in parallel.

import numpy as np
from bert4torch.tokenizers import Tokenizer
import json, glob, re
from tqdm import tqdm
import collections
import gc
import shelve
import time
import os
import random
import jieba

jieba.initialize()

class TrainingDataset(object):
    """Pretraining dataset generator."""
    def __init__(self, tokenizer, sequence_length=512):
        """Arguments:
        tokenizer must be the tokenizer class shipped with bert4keras/bert4torch;
        """
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        self.token_pad_id = tokenizer._token_pad_id
        self.token_cls_id = tokenizer._token_start_id
        self.token_sep_id = tokenizer._token_end_id
        self.token_mask_id = tokenizer._token_mask_id
        self.vocab_size = tokenizer._vocab_size

    def padding(self, sequence, padding_value=None):
        """Pad a single sequence."""
        if padding_value is None:
            padding_value = self.token_pad_id

        sequence = sequence[:self.sequence_length]
        padding_length = self.sequence_length - len(sequence)
        return sequence + [padding_value] * padding_length

    def sentence_process(self, text):
        """Process a single text and return the resulting instance."""
        raise NotImplementedError

    def paragraph_process(self, texts, starts, ends, paddings):
        """Process a single paragraph (multiple texts).
        Arguments: texts is a list of sentences; starts are the start ids of each instance;
        ends are the end ids of each instance; paddings are the padding ids of each instance.
        Approach: keep appending sentences until the length gets as close as possible to sequence_length, then pad.
        """
        instances, instance = [], [[start] for start in starts]
        for text in texts:
            # process a single sentence
            sub_instance = self.sentence_process(text)
            sub_instance = [i[:self.sequence_length - 2] for i in sub_instance]
            new_length = len(instance[0]) + len(sub_instance[0])

            # if the length is about to overflow
            if new_length > self.sequence_length - 1:
                # append the end token and pad
                complete_instance = []
                for item, end, pad in zip(instance, ends, paddings):
                    item.append(end)
                    item = self.padding(item, pad)
                    complete_instance.append(item)
                # store the result and start a new sample
                instances.append(complete_instance)
                instance = [[start] for start in starts]

            # continue the current sample
            for item, sub_item in zip(instance, sub_instance):
                item.extend(sub_item)

        # append the end token and pad
        complete_instance = []
        for item, end, pad in zip(instance, ends, paddings):
            item.append(end)
            item = self.padding(item, pad)
            complete_instance.append(item)

        # store the final instance
        instances.append(complete_instance)
        return instances

    def serialize(self, instances, db, count):
        """Write instances to the file."""
        for instance in instances:
            input_ids, masked_lm_labels = instance[0], instance[1]
            assert len(input_ids) <= sequence_length
            features = collections.OrderedDict()
            features["input_ids"] = input_ids
            features["masked_lm_labels"] = masked_lm_labels
            db[str(count)] = features
            count += 1
        return count

    def process(self, corpus, record_name):
        """Process the input corpus."""
        count = 0
        db = shelve.open(record_name)
        for texts in corpus:
            instances = self.paragraph_process(texts)
            count = self.serialize(instances, db, count)
        db.close()
        del instances
        gc.collect()

        # record the file name and the number of samples
        record_info = {"filename": record_name, "samples_num": count}
        json.dump(record_info, open(record_name + ".json", "w", encoding="utf-8"))
        print('write %s examples into %s' % (count, record_name))

class TrainingDatasetRoBERTa(TrainingDataset):
    """Pretraining dataset generator (RoBERTa style)."""
    def __init__(self, tokenizer, word_segment, mask_rate=0.15, sequence_length=512):
        """Arguments:
        tokenizer must be the tokenizer class shipped with bert4torch;
        word_segment is any word segmentation function.
        """
        super(TrainingDatasetRoBERTa, self).__init__(tokenizer, sequence_length)
        self.word_segment = word_segment
        self.mask_rate = mask_rate

    def token_process(self, token_id):
        """With 80% probability replace with [MASK], with 10% probability keep the token unchanged,
        and with 10% probability replace it with a random token.
        """
        rand = np.random.random()
        if rand <= 0.8:
            return self.token_mask_id
        elif rand <= 0.9:
            return token_id
        else:
            return np.random.randint(0, self.vocab_size)
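    # sentence_process below returns two parallel sequences: token_ids (the inputs, with whole words masked
    # at rate mask_rate) and mask_ids (the labels: the original token id at masked positions, 0 elsewhere).
    # Masking is whole-word: the random draw happens once per jieba word and applies to all of its sub-tokens.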
    def sentence_process(self, text):
        """Process a single text.
        Steps: segment into words, convert to ids, and build a whole-word-mask sequence according to
        mask_rate that marks which tokens should be masked.
        """
        words = self.word_segment(text)
        rands = np.random.random(len(words))

        token_ids, mask_ids = [], []
        for rand, word in zip(rands, words):
            word_tokens = self.tokenizer.tokenize(text=word)[1:-1]
            word_token_ids = self.tokenizer.tokens_to_ids(word_tokens)
            if rand < self.mask_rate:
                word_mask_ids = [self.token_process(i) for i in word_token_ids]
                token_ids.extend(word_mask_ids)
                mask_ids.extend(word_token_ids)
            else:
                token_ids.extend(word_token_ids)
                word_mask_ids = [0] * len(word_tokens)
                mask_ids.extend(word_mask_ids)

        return [token_ids, mask_ids]

    def paragraph_process(self, texts):
        """Supply starts, ends and paddings for the base method."""
        starts = [self.token_cls_id, 0]
        ends = [self.token_sep_id, 0]
        paddings = [self.token_pad_id, 0]
        return super(TrainingDatasetRoBERTa, self).paragraph_process(texts, starts, ends, paddings)

if __name__ == '__main__':
    sequence_length = 512  # text length
    max_file_num = 40  # maximum number of files to keep
    dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'  # vocab file
    dir_training_data = 'E:/Github/bert4torch/examples/datasets/pretrain'  # output directory
    dir_corpus = 'F:/Projects/data/corpus/pretrain'  # input corpus directory
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    def some_texts():
        '''Pick some corpus texts.'''
        files_corpus = glob.glob(f'{dir_corpus}/*/*')  # adjust to your own directory structure
        file_corpus = random.choice(files_corpus)  # pick a random document
        count, texts = 0, []
        with open(file_corpus, encoding='utf-8') as f:
            for l in tqdm(f, desc=f'Load data from {file_corpus}'):
                l = l.strip()
                texts.extend(re.findall(u'.*?[\n。]+', l))
                count += 1
                if count == 10:  # process 10 documents at a time
                    yield texts
                    count, texts = 0, []
        if texts:
            yield texts

    def word_segment(text):
        return jieba.lcut(text)

    TD = TrainingDatasetRoBERTa(tokenizer, word_segment, sequence_length=sequence_length)

    while True:
        train_files = [file for file in os.listdir(dir_training_data) if ('train_' in file) and ('dat' in file)]
        # keep generating while the number of saved training files is below the limit
        if len(train_files) < max_file_num:
            record_name = f'{dir_training_data}/train_' + time.strftime('%Y%m%d%H%M%S', time.localtime())
            TD.process(corpus=some_texts(), record_name=record_name)
            time.sleep(1)  # optional; avoids generating identical file names
        else:
            time.sleep(300)
bert/bert4torch_cmcc/examples/pretrain/simbert_v2_pretrain/simbert_v2_stage1.py
0 → 100644
#! -*- coding: utf-8 -*-
# SimBERT_v2 pretraining, stage 1: trained like SimBERT plus a [MASK] prediction task
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim

import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()

# basic settings
maxlen = 64
batch_size = 12

# bert configuration: roformer weights are loaded
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# The corpus here is not as rich as the official one; any custom corpus can be used
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the corpus, one json per line.
        Example: {"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                D.append(json.loads(l))
        return D

def truncate(text):
    """Truncate a sentence."""
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]

def masked_encode(text):
    """Whole-word random masking."""
    words = jieba.lcut(text)
    rands = np.random.random(len(words))
    source, target = [tokenizer._token_start_id], [0]
    for r, w in zip(rands, words):
        ids = tokenizer.encode(w)[0][1:-1]
        if r < 0.15 * 0.8:
            source.extend([tokenizer._token_mask_id] * len(ids))
            target.extend(ids)
        elif r < 0.15 * 0.9:
            source.extend(ids)
            target.extend(ids)
        elif r < 0.15:
            source.extend(np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1)
            target.extend(ids)
        else:
            source.extend(ids)
            target.extend([0] * len(ids))
    source = source[:maxlen - 1] + [tokenizer._token_end_id]
    target = target[:maxlen - 1] + [0]
    return source, target
def
collate_fn
(
batch
):
batch_token_ids
,
batch_segment_ids
=
[],
[]
for
d
in
batch
:
text
,
synonyms
=
d
[
'text'
],
d
[
'synonyms'
]
synonyms
=
[
text
]
+
synonyms
np
.
random
.
shuffle
(
synonyms
)
for
_
in
range
(
2
):
text
,
synonym
=
synonyms
[:
2
]
if
np
.
random
.
random
()
<
0.5
:
text_ids
=
masked_encode
(
text
)[
0
]
else
:
text_ids
=
tokenizer
.
encode
(
text
)[
0
]
synonym_ids
=
tokenizer
.
encode
(
synonym
)[
0
][
1
:]
truncate_sequences
(
maxlen
*
2
,
-
2
,
text_ids
,
synonym_ids
)
token_ids
=
text_ids
+
synonym_ids
segment_ids
=
[
0
]
*
len
(
text_ids
)
+
[
1
]
*
len
(
synonym_ids
)
batch_token_ids
.
append
(
token_ids
)
batch_segment_ids
.
append
(
segment_ids
)
text
,
synonym
=
synonym
,
text
batch_token_ids
=
torch
.
tensor
(
sequence_padding
(
batch_token_ids
),
dtype
=
torch
.
long
,
device
=
device
)
batch_segment_ids
=
torch
.
tensor
(
sequence_padding
(
batch_segment_ids
),
dtype
=
torch
.
long
,
device
=
device
)
return
[
batch_token_ids
,
batch_segment_ids
],
[
batch_token_ids
,
batch_segment_ids
]
train_dataloader
=
DataLoader
(
MyDataset
(
'../datasets/data_similarity.json'
),
batch_size
=
batch_size
,
shuffle
=
True
,
collate_fn
=
collate_fn
)
# 建立加载模型
class
Model
(
BaseModel
):
def
__init__
(
self
,
pool_method
=
'cls'
):
super
().
__init__
()
self
.
bert
=
build_transformer_model
(
config_path
=
config_path
,
checkpoint_path
=
checkpoint_path
,
model
=
'roformer'
,
with_pool
=
'linear'
,
with_mlm
=
True
,
dropout_rate
=
0.2
,
application
=
'unilm'
)
self
.
pool_method
=
pool_method
def
forward
(
self
,
token_ids
,
segment_ids
):
hidden_state
,
pool_cls
,
seq_logit
=
self
.
bert
([
token_ids
,
segment_ids
])
sen_emb
=
get_pool_emb
(
hidden_state
,
pool_cls
,
token_ids
.
gt
(
0
).
long
(),
self
.
pool_method
)
return
seq_logit
,
sen_emb
model
=
Model
(
pool_method
=
'cls'
).
to
(
device
)
class
TotalLoss
(
nn
.
Module
):
"""loss分两部分,一是seq2seq的交叉熵,二是相似度的交叉熵。
"""
def
forward
(
self
,
outputs
,
target
):
seq_logit
,
sen_emb
=
outputs
seq_label
,
seq_mask
=
target
seq2seq_loss
=
self
.
compute_loss_of_seq2seq
(
seq_logit
,
seq_label
,
seq_mask
)
similarity_loss
=
self
.
compute_loss_of_similarity
(
sen_emb
)
return
{
'loss'
:
seq2seq_loss
+
similarity_loss
,
'seq2seq_loss'
:
seq2seq_loss
,
'similarity_loss'
:
similarity_loss
}
def
compute_loss_of_seq2seq
(
self
,
y_pred
,
y_true
,
y_mask
):
'''
y_pred: [btz, seq_len, hdsz]
y_true: [btz, seq_len]
y_mask: [btz, seq_len]
'''
y_true
=
y_true
[:,
1
:]
# 目标token_ids
y_mask
=
y_mask
[:,
1
:]
# 指示了要预测的部分
y_pred
=
y_pred
[:,
:
-
1
,
:]
# 预测序列,错开一位
y_pred
=
y_pred
.
reshape
(
-
1
,
y_pred
.
shape
[
-
1
])
y_true
=
(
y_true
*
y_mask
).
flatten
()
return
F
.
cross_entropy
(
y_pred
,
y_true
,
ignore_index
=
0
)
def
compute_loss_of_similarity
(
self
,
y_pred
):
y_true
=
self
.
get_labels_of_similarity
(
y_pred
)
# 构建标签
y_pred
=
F
.
normalize
(
y_pred
,
p
=
2
,
dim
=-
1
)
# 句向量归一化
similarities
=
torch
.
matmul
(
y_pred
,
y_pred
.
T
)
# 相似度矩阵
similarities
=
similarities
-
torch
.
eye
(
y_pred
.
shape
[
0
],
device
=
device
)
*
1e12
# 排除对角线
similarities
=
similarities
*
30
# scale
loss
=
F
.
cross_entropy
(
similarities
,
y_true
)
return
loss
def
get_labels_of_similarity
(
self
,
y_pred
):
idxs
=
torch
.
arange
(
0
,
y_pred
.
shape
[
0
],
device
=
device
)
idxs_1
=
idxs
[
None
,
:]
idxs_2
=
(
idxs
+
1
-
idxs
%
2
*
2
)[:,
None
]
labels
=
idxs_1
.
eq
(
idxs_2
).
float
()
return
labels
model
.
compile
(
loss
=
TotalLoss
(),
optimizer
=
optim
.
Adam
(
model
.
parameters
(),
1e-5
),
metrics
=
[
'seq2seq_loss'
,
'similarity_loss'
])
class
SynonymsGenerator
(
AutoRegressiveDecoder
):
"""seq2seq解码器
"""
@
AutoRegressiveDecoder
.
wraps
(
'logits'
)
def
predict
(
self
,
inputs
,
output_ids
,
states
):
token_ids
,
segment_ids
=
inputs
token_ids
=
torch
.
cat
([
token_ids
,
output_ids
],
1
)
segment_ids
=
torch
.
cat
([
segment_ids
,
torch
.
ones_like
(
output_ids
,
device
=
device
)],
1
)
seq_logit
,
_
=
model
.
predict
([
token_ids
,
segment_ids
])
return
seq_logit
[:,
-
1
,
:]
def
generate
(
self
,
text
,
n
=
1
,
topk
=
5
):
token_ids
,
segment_ids
=
tokenizer
.
encode
(
text
,
maxlen
=
maxlen
)
output_ids
=
self
.
random_sample
([
token_ids
,
segment_ids
],
n
,
topk
)
# 基于随机采样
return
[
tokenizer
.
decode
(
ids
.
cpu
().
numpy
())
for
ids
in
output_ids
]
synonyms_generator
=
SynonymsGenerator
(
start_id
=
None
,
end_id
=
tokenizer
.
_token_end_id
,
maxlen
=
maxlen
,
device
=
device
)
def
cal_sen_emb
(
text_list
):
'''输入text的list,计算sentence的embedding
'''
X
,
S
=
[],
[]
for
t
in
text_list
:
x
,
s
=
tokenizer
.
encode
(
t
)
X
.
append
(
x
)
S
.
append
(
s
)
X
=
torch
.
tensor
(
sequence_padding
(
X
),
dtype
=
torch
.
long
,
device
=
device
)
S
=
torch
.
tensor
(
sequence_padding
(
S
),
dtype
=
torch
.
long
,
device
=
device
)
_
,
Z
=
model
.
predict
([
X
,
S
])
return
Z
def
gen_synonyms
(
text
,
n
=
100
,
k
=
20
):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r
=
synonyms_generator
.
generate
(
text
,
n
)
r
=
[
i
for
i
in
set
(
r
)
if
i
!=
text
]
# 不和原文相同
r
=
[
text
]
+
r
Z
=
cal_sen_emb
(
r
)
Z
/=
(
Z
**
2
).
sum
(
dim
=
1
,
keepdims
=
True
)
**
0.5
argsort
=
torch
.
matmul
(
Z
[
1
:],
-
Z
[
0
]).
argsort
()
return
[
r
[
i
+
1
]
for
i
in
argsort
[:
k
]]
def
just_show
(
some_samples
):
"""随机观察一些样本的效果
"""
S
=
[
np
.
random
.
choice
(
some_samples
)
for
_
in
range
(
3
)]
for
s
in
S
:
try
:
print
(
u
'原句子:%s'
%
s
)
print
(
u
'同义句子:'
,
gen_synonyms
(
s
,
10
,
10
))
print
()
except
:
pass
class
Evaluator
(
Callback
):
"""评估模型
"""
def
__init__
(
self
):
self
.
lowest
=
1e10
def
on_epoch_end
(
self
,
global_step
,
epoch
,
logs
=
None
):
# 保存最优
if
logs
[
'loss'
]
<=
self
.
lowest
:
self
.
lowest
=
logs
[
'loss'
]
# model.save_weights('./best_model.pt')
# 演示效果
just_show
([
'微信和支付宝拿个好用?'
,
'微信和支付宝,哪个好?'
,
'微信和支付宝哪个好'
,
'支付宝和微信哪个好'
,
'支付宝和微信哪个好啊'
,
'微信和支付宝那个好用?'
,
'微信和支付宝哪个好用'
,
'支付宝和微信那个更好'
,
'支付宝和微信哪个好用'
,
'微信和支付宝用起来哪个好?'
,
'微信和支付宝选哪个好'
])
if
__name__
==
'__main__'
:
evaluator
=
Evaluator
()
model
.
fit
(
train_dataloader
,
epochs
=
50
,
steps_per_epoch
=
200
,
callbacks
=
[
evaluator
])
else
:
model
.
load_weights
(
'./best_model.pt'
)
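Note: the pairing logic in get_labels_of_similarity above relies on collate_fn appending two views of each sample back to back, so rows 2k and 2k+1 of the batch are positives. A small check of the index arithmetic, run outside the training script (batch size 4 chosen arbitrarily for illustration):

import torch

idxs = torch.arange(0, 4)
partner = idxs + 1 - idxs % 2 * 2            # maps 0->1, 1->0, 2->3, 3->2
labels = idxs[None, :].eq(partner[:, None]).float()
print(partner.tolist())                      # [1, 0, 3, 2]
print(labels)                                # each row is one-hot at its partner's column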
bert/bert4torch_cmcc/examples/pretrain/simbert_v2_pretrain/simbert_v2_stage2.py
0 → 100644
#! -*- coding: utf-8 -*-
# SimBERT_v2 pretraining, stage 2: distill SimBERT's similarity scores into roformer-sim
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim

import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, get_pool_emb
from bert4torch.snippets import AutoRegressiveDecoder, Callback, truncate_sequences
from bert4torch.tokenizers import Tokenizer
import jieba
jieba.initialize()

# basic settings
maxlen = 64
batch_size = 12

# bert config: the stage1-trained weights should be loaded here; the official final weights are used as an example
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# the corpus is kept the same as in stage1
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the corpus, one json per line.
        Example: {"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                D.append(json.loads(l))
        return D

def truncate(text):
    """Truncate a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]

def masked_encode(text):
    """Whole-word random masking (wwm)
    """
    words = jieba.lcut(text)
    rands = np.random.random(len(words))
    source, target = [tokenizer._token_start_id], [0]
    for r, w in zip(rands, words):
        ids = tokenizer.encode(w)[0][1:-1]
        if r < 0.15 * 0.8:
            source.extend([tokenizer._token_mask_id] * len(ids))
            target.extend(ids)
        elif r < 0.15 * 0.9:
            source.extend(ids)
            target.extend(ids)
        elif r < 0.15:
            source.extend(np.random.choice(tokenizer._vocab_size - 1, size=len(ids)) + 1)
            target.extend(ids)
        else:
            source.extend(ids)
            target.extend([0] * len(ids))
    source = source[:maxlen - 1] + [tokenizer._token_end_id]
    target = target[:maxlen - 1] + [0]
    return source, target

# ========== for distillation: begin ==========
# simbert (teacher) config
sim_config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
sim_checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
sim_dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'

# build the teacher tokenizer
sim_tokenizer = Tokenizer(sim_dict_path, do_lower_case=True)

# build and load the teacher model
simbert = build_transformer_model(sim_config_path, sim_checkpoint_path, with_pool='linear', application='unilm').to(device)
# ========== for distillation: end ==========

def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    batch_sim_token_ids, batch_sim_segment_ids = [], []
    for d in batch:
        text, synonyms = d['text'], d['synonyms']
        synonyms = [text] + synonyms
        np.random.shuffle(synonyms)
        for _ in range(2):
            text, synonym = synonyms[:2]
            if np.random.random() < 0.5:
                text_ids = masked_encode(text)[0]
            else:
                text_ids = tokenizer.encode(text)[0]
            synonym_ids = tokenizer.encode(synonym)[0][1:]
            truncate_sequences(maxlen * 2, -2, text_ids, synonym_ids)
            token_ids = text_ids + synonym_ids
            segment_ids = [0] * len(text_ids) + [1] * len(synonym_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)

            # ==== for distillation: begin ====
            token_ids, segment_ids = sim_tokenizer.encode(text, maxlen=maxlen)
            batch_sim_token_ids.append(token_ids)
            batch_sim_segment_ids.append(segment_ids)
            # ==== for distillation: end ====
            text, synonym = synonym, text

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)

    # ==== for distillation: begin ====
    batch_sim_token_ids = torch.tensor(sequence_padding(batch_sim_token_ids), dtype=torch.long, device=device)
    batch_sim_segment_ids = torch.tensor(sequence_padding(batch_sim_segment_ids), dtype=torch.long, device=device)
    sim_vecs = simbert.predict([batch_sim_token_ids, batch_sim_segment_ids])[1]
    sim_vecs /= (sim_vecs ** 2).sum(dim=-1, keepdims=True) ** 0.5
    sims = torch.matmul(sim_vecs, sim_vecs.T)
    # ==== for distillation: end ====
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids, sims]

train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
                                            with_pool='linear', with_mlm=True, dropout_rate=0.2, application='unilm')
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb

model = Model(pool_method='cls').to(device)

class TotalLoss(nn.Module):
    """The loss has two parts: the seq2seq cross entropy and the similarity distillation loss.
    """
    def forward(self, outputs, target):
        seq_logit, sen_emb = outputs
        seq_label, seq_mask, sims = target
        seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
        similarity_loss = self.compute_loss_of_similarity(sen_emb, sims)
        return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}

    def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
        '''
        y_pred: [btz, seq_len, hdsz]
        y_true: [btz, seq_len]
        y_mask: [btz, seq_len]
        '''
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # indicates the positions to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return F.cross_entropy(y_pred, y_true, ignore_index=0)

    def compute_loss_of_similarity(self, y_pred, y_true):
        y_pred = F.normalize(y_pred, p=2, dim=-1)  # L2-normalize sentence embeddings
        similarities = torch.matmul(y_pred, y_pred.T)  # similarity matrix
        loss = 100 * torch.mean((similarities - y_true) ** 2)
        return loss

model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])

class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]

synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z

def gen_synonyms(text, n=100, k=20):
    """Generate n similar sentences for `text`, then return the k most similar ones.
    Approach: generate candidates with seq2seq, then score and rank them with the encoder similarity.
    Example:
    >>> gen_synonyms(u'微信和支付宝哪个好?')
    [
        u'微信和支付宝,哪个好?',
        u'微信和支付宝哪个好',
        u'支付宝和微信哪个好',
        u'支付宝和微信哪个好啊',
        u'微信和支付宝那个好用?',
        u'微信和支付宝哪个好用',
        u'支付宝和微信那个更好',
        u'支付宝和微信哪个好用',
        u'微信和支付宝用起来哪个好?',
        u'微信和支付宝选哪个好',
    ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # drop candidates identical to the input
    r = [text] + r
    Z = cal_sen_emb(r)
    Z /= (Z ** 2).sum(dim=1, keepdims=True) ** 0.5
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]

def just_show(some_samples):
    """Inspect generation quality on a few random samples
    """
    S = [np.random.choice(some_samples) for _ in range(3)]
    for s in S:
        try:
            print(u'原句子:%s' % s)
            print(u'同义句子:', gen_synonyms(s, 10, 10))
            print()
        except:
            pass

class Evaluator(Callback):
    """Evaluate the model
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, global_step, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # demo the generation
        just_show(['微信和支付宝拿个好用?', '微信和支付宝,哪个好?', '微信和支付宝哪个好', '支付宝和微信哪个好',
                   '支付宝和微信哪个好啊', '微信和支付宝那个好用?', '微信和支付宝哪个好用', '支付宝和微信那个更好',
                   '支付宝和微信哪个好用', '微信和支付宝用起来哪个好?', '微信和支付宝选哪个好'])

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
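Note: in stage 2 the similarity target is the teacher's cosine-similarity matrix, matched with a scaled MSE rather than the in-batch cross entropy of stage 1. A toy illustration of that objective, with random vectors standing in for the SimBERT teacher and roformer student embeddings (shapes 4x8 are arbitrary):

import torch
import torch.nn.functional as F

teacher = F.normalize(torch.randn(4, 8), dim=-1)   # stands in for simbert.predict(...)[1]
student = F.normalize(torch.randn(4, 8), dim=-1)   # stands in for the model's sen_emb
teacher_sims = teacher @ teacher.T                 # precomputed once in collate_fn
student_sims = student @ student.T                 # computed in compute_loss_of_similarity
loss = 100 * torch.mean((student_sims - teacher_sims) ** 2)
print(loss.item())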
bert/bert4torch_cmcc/examples/pretrain/simbert_v2_pretrain/simbert_v2_supervised.py
0 → 100644
#! -*- coding: utf-8 -*-
# SimBERT_v2 supervised training stage
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate
from bert4torch.snippets import Callback, truncate_sequences, get_pool_emb
from bert4torch.tokenizers import Tokenizer
import json
import glob

# basic settings
maxlen = 64
batch_size = 12
labels = ['contradiction', 'entailment', 'neutral']

# bert config: the stage2-trained weights should be loaded here; the official final weights are used as an example
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def split(text):
    """Split a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen * 1.2, seps, strips)

class MyDataset(ListDataset):
    def load_data(self, file_path):
        dataset1_path, dataset2_path = file_path
        D1 = self.load_data_1(dataset1_path)
        D2 = self.load_data_2(dataset2_path)
        return D1 + D2

    @staticmethod
    def load_data_1(filenames, threshold=0.5):
        """Load labelled data.
        Item format: (text1, text2, label)
        """
        D = []
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    l = l.strip().split('\t')
                    if len(l) != 3:
                        continue
                    l[0], l[1] = split(l[0])[0], split(l[1])[0]
                    D.append((l[0], l[1], int(float(l[2]) > threshold)))
        return D

    @staticmethod
    def load_data_2(dir_path):
        """Load labelled data.
        Item format: (text1, text2, label)
        """
        D = []
        for filename in glob.glob(dir_path):
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    l = json.loads(l)
                    if l['gold_label'] not in labels:
                        continue
                    text1 = split(l['sentence1'])[0]
                    text2 = split(l['sentence2'])[0]
                    label = labels.index(l['gold_label']) + 2
                    D.append((text1, text2, label))
        return D

def truncate(text):
    """Truncate a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        for text in [text1, text2]:
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels

# load the datasets
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
dataset1_path = []
for task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']:
    for f in ['train', 'valid']:
        threshold = 2.5 if task_name == 'STS-B' else 0.5
        filename = '%s%s/%s.%s.data' % (data_path, task_name, task_name, f)
        dataset1_path.append(filename)
dataset2_path = 'F:/Projects/data/corpus/sentence_embedding/XNLI-MT-1.0/cnsd/cnsd-*/*.jsonl'

train_dataloader = DataLoader(MyDataset([dataset1_path, dataset2_path]), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer',
                                            with_pool='linear', dropout_rate=0.2)
        self.pool_method = pool_method
        self.dense = nn.Linear(768 * 3, 5, bias=False)

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)  # [btz*2, hdsz]
        # merge the sentence vectors: concatenate u, v and |u-v|
        u, v = sen_emb[::2], sen_emb[1::2]
        sen_emb_concat = torch.cat([u, v, torch.abs(u - v)], dim=-1)  # [btz, hdsz*3]
        y_pred = self.dense(sen_emb_concat)  # [btz, 5]
        return y_pred

model = Model(pool_method='cls').to(device)

class MyLoss(nn.Module):
    """Loss
    """
    def __init__(self) -> None:
        super().__init__()
        self.mask = torch.tensor([0, 0, 1, 1, 1], device=device)

    def forward(self, y_pred, y_true):
        '''For two-class samples set the last three logits to -inf; for three-class samples set the first two logits to -inf.
        '''
        task = (y_true < 1.5).long()
        y_pred_1 = y_pred - self.mask * 1e12
        y_pred_2 = y_pred - (1 - self.mask) * 1e12
        y_pred = task * y_pred_1 + (1 - task) * y_pred_2
        return F.cross_entropy(y_pred, y_true.flatten())

model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])

class Evaluator(Callback):
    """Evaluate the model
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, global_step, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
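Note: MyLoss above trains one 5-way head on a mix of binary similarity labels (0/1) and NLI labels (2/3/4) by masking out the logits that do not belong to the sample's label space. A toy illustration of that masking with arbitrary values:

import torch
import torch.nn.functional as F

mask = torch.tensor([0, 0, 1, 1, 1])
y_pred = torch.randn(2, 5)                         # two samples, 5 logits each
y_true = torch.tensor([[1], [3]])                  # one binary sample, one NLI sample
task = (y_true < 1.5).long()                       # 1 -> binary, 0 -> NLI
masked = task * (y_pred - mask * 1e12) + (1 - task) * (y_pred - (1 - mask) * 1e12)
print(F.cross_entropy(masked, y_true.flatten()))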