sunzhq2 / yidong-infer · Commits
Commit 92c75df1, authored Jan 20, 2026 by sunzhq2
yidong infer init
Showing 20 changed files with 4152 additions and 0 deletions (+4152, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_concat_CrossEntropyLoss.py  +167  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_CT.py  +212  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_CT_In-Batch_Negatives.py  +207  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_DiffCSE.py  +306  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_ESimCSE.py  +269  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_PromptBert.py  +213  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_SimCSE.py  +189  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_TSDAE.py  +221  -0
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_bert_whitening.py  +158  -0
bert/bert4torch_cmcc/examples/seq2seq/task_kgclue_seq2seq.py  +356  -0
bert/bert4torch_cmcc/examples/seq2seq/task_question_answer_generation_by_seq2seq.py  +192  -0
bert/bert4torch_cmcc/examples/seq2seq/task_reading_comprehension_by_mlm.py  +237  -0
bert/bert4torch_cmcc/examples/seq2seq/task_reading_comprehension_by_seq2seq.py  +268  -0
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_ape210k_math_word_problem.py  +201  -0
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle.py  +139  -0
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_bart.py  +154  -0
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_mt5.py  +165  -0
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_t5_pegasus.py  +163  -0
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_uer_t5.py  +161  -0
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_unilm.py  +174  -0
Too many changes to show. To preserve performance, only 150 of 150+ files are displayed.
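Most of the unsupervised sentence_embedding examples in this commit read their configuration from sys.argv as (model_type, pooling, task_name, dropout_rate); the supervised concat script hardcodes pooling and task_name, and the PromptBert script drops the pooling argument. A minimal sketch of how one of these scripts could be launched; the subprocess wrapper and the chosen argument values are illustrative assumptions, not a documented CLI of the commit:

# Hypothetical launcher for one of the example scripts below; the scripts themselves
# unpack sys.argv[1:] into (model_type, pooling, task_name, dropout_rate).
import subprocess

subprocess.run([
    'python', 'bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_SimCSE.py',
    'BERT',     # model_type: BERT / RoBERTa / NEZHA / RoFormer / SimBERT
    'cls',      # pooling: first-last-avg / last-avg / cls / pooler
    'STS-B',    # task_name: ATEC / BQ / LCQMC / PAWSX / STS-B
    '0.3',      # dropout_rate, converted with float() inside the script
], check=True)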
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_concat_CrossEntropyLoss.py
0 → 100644
#! -*- coding:utf-8 -*-
# loss: the two sentence vectors are concatenated as (u, v, u-v, u*v) and fed into CrossEntropyLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # command-line arguments
pooling, task_name = 'cls', 'ATEC'  # for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data.
        Single sample format: (text1, text2, label_id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], int(l[2])))
        return D


def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        label = int(label > 2.5) if task_name == 'STS-B' else label
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([label])

    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()


# Load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'),
                              batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'),
                             batch_size=batch_size, collate_fn=collate_fn)


# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls', concatenation_sent_rep=True, concatenation_sent_difference=True,
                 concatenation_sent_multiplication=False):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0,
                                            with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)
        self.concatenation_sent_rep = concatenation_sent_rep
        self.concatenation_sent_difference = concatenation_sent_difference
        self.concatenation_sent_multiplication = concatenation_sent_multiplication

        hidden_unit = 0
        hidden_unit += 768 * 2 if self.concatenation_sent_rep else 0
        hidden_unit += 768 if self.concatenation_sent_difference else 0
        hidden_unit += 768 if self.concatenation_sent_multiplication else 0
        self.fc = nn.Linear(hidden_unit, 2)

    def forward(self, token1_ids, token2_ids):
        hidden_state1, pooler1 = self.bert([token1_ids])
        rep_a = get_pool_emb(hidden_state1, pooler1, token1_ids.gt(0).long(), self.pool_method)

        hidden_state2, pooler2 = self.bert([token2_ids])
        rep_b = get_pool_emb(hidden_state2, pooler2, token2_ids.gt(0).long(), self.pool_method)

        vectors_concat = []
        if self.concatenation_sent_rep:
            vectors_concat.append(rep_a)
            vectors_concat.append(rep_b)
        if self.concatenation_sent_difference:
            vectors_concat.append(torch.abs(rep_a - rep_b))
        if self.concatenation_sent_multiplication:
            vectors_concat.append(rep_a * rep_b)
        vectors_concat = torch.cat(vectors_concat, dim=1)
        return self.fc(vectors_concat)

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = self.evaluate(valid_dataloader)
        test_consine = self.evaluate(test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
            embeddings1.append(model.predict(batch_token1_ids).cpu())
            embeddings2.append(model.predict(batch_token2_ids).cpu())
            labels.append(batch_labels)
        embeddings1 = torch.cat(embeddings1).numpy()
        embeddings2 = torch.cat(embeddings2).numpy()
        labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
        return eval_pearson_cosine


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
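With the default flags above (the two sentence reps plus their absolute difference, no element-wise product), the classifier input width is 768*2 + 768 = 2304. A minimal self-contained sketch of that (u, v, |u-v|) feature construction on dummy tensors, independent of bert4torch and purely illustrative:

import torch
import torch.nn as nn

rep_a = torch.randn(4, 768)   # pooled embedding of sentence 1 (batch of 4)
rep_b = torch.randn(4, 768)   # pooled embedding of sentence 2
features = torch.cat([rep_a, rep_b, torch.abs(rep_a - rep_b)], dim=1)   # [4, 2304]
logits = nn.Linear(768 * 3, 2)(features)                                # same width as hidden_unit above
print(features.shape, logits.shape)   # torch.Size([4, 2304]) torch.Size([4, 2])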
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_CT.py
0 → 100644
#! -*- coding:utf-8 -*-
# Semantic similarity task - unsupervised
# ContrastiveTensionLoss: the same sentence is fed into two models; the dot product of the pooled outputs should be large
# | solution | ATEC  | BQ   | LCQMC | PAWSX | STS-B |
# | CT       | 30.65 | 44.50| 68.67 | 16.20 | 69.27 |
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import pearsonr, spearmanr
import copy
import random
from tqdm import tqdm
import numpy as np
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)

# For NEZHA and RoFormer the `model` argument of build_transformer_model has to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load data (with labels).
    Single sample format: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# Training dataloader
def collate_fn(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    pos_id = random.randint(0, len(batch)-1)
    pos_token_ids, _ = tokenizer.encode(batch[pos_id], maxlen=maxlen)
    texts_list[0].append(pos_token_ids)
    texts_list[1].append(pos_token_ids)
    labels.append(1)
    for neg_id in range(len(batch)):
        if neg_id == pos_id:
            continue
        elif random.random() < 0.5:
            neg_token_ids, _ = tokenizer.encode(batch[neg_id], maxlen=maxlen)
            texts_list[0].append(pos_token_ids)
            texts_list[1].append(neg_token_ids)
            labels.append(0)
        else:
            neg_token_ids, _ = tokenizer.encode(batch[neg_id], maxlen=maxlen)
            texts_list[0].append(neg_token_ids)
            texts_list[1].append(pos_token_ids)
            labels.append(0)

    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.model1 = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                              dropout_rate=dropout_rate, with_pool=with_pool,
                                              output_all_encoded_layers=output_all_encoded_layers)
        self.model2 = copy.deepcopy(self.model1)
        self.pool_method = pool_method

    def forward(self, token_ids_list):
        token_ids1 = token_ids_list[0]
        hidden_state1, pool_cls1 = self.model1([token_ids1])
        embeddings_a = get_pool_emb(hidden_state1, pool_cls1, token_ids1.gt(0).long(), self.pool_method)

        token_ids2 = token_ids_list[1]
        hidden_state2, pool_cls2 = self.model2([token_ids2])
        embeddings_b = get_pool_emb(hidden_state2, pool_cls2, token_ids2.gt(0).long(), self.pool_method)

        return torch.matmul(embeddings_a[:, None], embeddings_b[:, :, None]).squeeze(-1).squeeze(-1)  # [btz]

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.model1([token_ids])
            output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return output

model = Model(pool_method=pooling).to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.BCEWithLogitsLoss(reduction='mean'),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),  # use a sufficiently small learning rate
)

# Evaluation function
def evaluate(data):
    cosine_scores, labels = [], []
    for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
        embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
        embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
        cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        cosine_scores.append(cosine_score)
        labels.append(label)
    cosine_scores = np.concatenate(cosine_scores)
    labels = torch.cat(labels).cpu().numpy()
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
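The CT objective above is binary: the collate function pairs one anchor sentence with itself (label 1) and with the other sentences in the batch (label 0), and the model's forward pass returns the raw dot product of the two encoders' pooled outputs, which is scored with BCEWithLogitsLoss. A minimal sketch of that loss on dummy embeddings, purely illustrative and not part of the commit:

import torch
import torch.nn as nn

emb_a = torch.randn(8, 768)    # pooled outputs of model1
emb_b = torch.randn(8, 768)    # pooled outputs of model2 for the paired sentences
logits = (emb_a * emb_b).sum(dim=-1)       # row-wise dot product, shape [8]
labels = torch.tensor([1.] + [0.] * 7)     # one identical pair, seven mismatched pairs
loss = nn.BCEWithLogitsLoss(reduction='mean')(logits, labels)
print(loss.item())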
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_CT_In-Batch_Negatives.py
0 → 100644
#! -*- coding:utf-8 -*-
# Semantic similarity task - unsupervised
# loss: contrastive learning loss (similar to SimCSE), just with two separate models
# | solution        | ATEC  | BQ   | LCQMC | PAWSX | STS-B |
# | CT_In_Batch_Neg | 32.47 | 47.09| 68.56 | 27.50 | 74.00 |
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import copy
import numpy as np
from tqdm import tqdm
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)

# For NEZHA and RoFormer the `model` argument of build_transformer_model has to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load data (with labels).
    Single sample format: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# Training dataloader
def collate_fn(batch):
    texts_list = [[] for _ in range(2)]
    for text in batch:
        token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
        texts_list[0].append(token_ids)
        texts_list[1].append(token_ids)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
    return texts_list, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.model1 = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                              dropout_rate=dropout_rate, with_pool=with_pool,
                                              output_all_encoded_layers=output_all_encoded_layers)
        self.model2 = copy.deepcopy(self.model1)
        self.pool_method = pool_method
        self.scale = scale

    def forward(self, token_ids_list):
        token_ids = token_ids_list[0]
        hidden_state1, pooler1 = self.model1([token_ids])
        embeddings_a = get_pool_emb(hidden_state1, pooler1, token_ids.gt(0).long(), self.pool_method)

        token_ids = token_ids_list[1]
        hidden_state2, pooler2 = self.model2([token_ids])
        embeddings_b = get_pool_emb(hidden_state2, pooler2, token_ids.gt(0).long(), self.pool_method)

        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.model1([token_ids])
            output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return output

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = Model(pool_method=pooling).to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

# Evaluation function
def evaluate(data):
    cosine_scores, labels = [], []
    for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
        embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
        embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
        cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        cosine_scores.append(cosine_score)
        labels.append(label)
    cosine_scores = np.concatenate(cosine_scores)
    labels = torch.cat(labels).cpu().numpy()
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
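The in-batch-negative variant replaces the binary dot-product target with a scaled cosine-similarity matrix: row i should score highest against column i, so the labels are simply torch.arange(batch_size) and the loss is CrossEntropyLoss. A minimal sketch of that loss on dummy embeddings, illustration only:

import torch
import torch.nn as nn
import torch.nn.functional as F

emb_a = F.normalize(torch.randn(8, 768), p=2, dim=1)   # encoder-1 outputs
emb_b = F.normalize(torch.randn(8, 768), p=2, dim=1)   # encoder-2 outputs for the same sentences
scores = emb_a @ emb_b.T * 20.0                         # scaled cosine similarity, [8, 8]
labels = torch.arange(8)                                # diagonal entries are the positives
loss = nn.CrossEntropyLoss()(scores, labels)
print(loss.item())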
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_DiffCSE.py
0 → 100644
#! -*- coding: utf-8 -*-
# DiffCSE Chinese test: the generator and the discriminator of the ELECTRA part both use the same BERT model
# Original project: https://github.com/voidism/DiffCSE
# The original project uses btz*2 (a doubled batch) to do the masking
from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
from bert4torch.snippets import ListDataset
import torch.nn.functional as F
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
# model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.3  # for debugging
print(model_type, pooling, task_name, dropout_rate)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64
lambda_weight = 0.05  # weight of the ELECTRA part of the loss

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load data (with labels).
    Single sample format: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

def mask_tokens(inputs, special_tokens_mask=None):
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
    """
    mlm_probability = 0.3
    special_tokens = {tokenizer._token_start_id, tokenizer._token_end_id, tokenizer._token_pad_id,
                      tokenizer._token_unk_id, tokenizer._token_mask_id}
    inputs = inputs.clone()
    labels = inputs.clone()
    # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
    probability_matrix = torch.full(labels.shape, mlm_probability)
    if special_tokens_mask is None:
        special_tokens_mask = [[val in special_tokens for val in smp] for smp in labels.tolist()]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
    else:
        special_tokens_mask = special_tokens_mask.bool()

    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer._token_mask_id

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(tokenizer._vocab_size, labels.shape, dtype=torch.long, device=device)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

# Training dataloader
def collate_fn(batch):
    input_ids = []
    for text in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        input_ids.append(token_ids)
    input_ids.extend(input_ids)
    input_ids = torch.tensor(sequence_padding(input_ids), dtype=torch.long, device=device)
    labels = torch.arange(len(batch), device=device)

    # mlm_inputs and mlm_outputs
    mlm_inputs, mlm_labels = mask_tokens(input_ids)
    attention_mask = input_ids.gt(0).long()
    return [input_ids, mlm_inputs], [labels, mlm_labels, attention_mask]
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Define the generator
generator = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                    dropout_rate=dropout_rate, with_mlm=True)
generator.to(device)
generator.eval()

class ProjectionMLP(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        in_dim = hidden_size
        hidden_dim = hidden_size * 2
        out_dim = hidden_size
        affine = False
        list_layers = [nn.Linear(in_dim, hidden_dim, bias=False), nn.BatchNorm1d(hidden_dim), nn.ReLU(inplace=True)]
        list_layers += [nn.Linear(hidden_dim, out_dim, bias=False), nn.BatchNorm1d(out_dim, affine=affine)]
        self.net = nn.Sequential(*list_layers)

    def forward(self, x):
        return self.net(x)

class Similarity(nn.Module):
    """
    Dot product or cosine similarity
    """
    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)
        self.record = None
        self.pos_avg = 0.0
        self.neg_avg = 0.0

    def forward(self, x, y):
        sim = self.cos(x, y)
        self.record = sim.detach()
        min_size = min(self.record.shape[0], self.record.shape[1])
        num_item = self.record.shape[0] * self.record.shape[1]
        self.pos_avg = self.record.diag().sum() / min_size
        self.neg_avg = (self.record.sum() - self.record.diag().sum()) / (num_item - min_size)
        return sim / self.temp

# Build the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                            dropout_rate=dropout_rate, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)
        self.mlp = ProjectionMLP(self.bert.configs['hidden_size'])
        self.discriminator = build_transformer_model(config_path, checkpoint_path, model=model_name,
                                                     segment_vocab_size=0, dropout_rate=dropout_rate)
        self.electra_head = nn.Linear(self.bert.configs['hidden_size'], 2)
        self.sim = Similarity(temp=0.05)

    def forward(self, input_ids, mlm_inputs):
        # Same computation logic as ESimCSE
        attention_mask = input_ids.gt(0).long()
        hidden_state1, pooler = self.bert([input_ids])
        reps = get_pool_emb(hidden_state1, pooler, attention_mask, self.pool_method)
        if self.pool_method == 'cls':
            reps = self.mlp(reps)
        batch_size = input_ids.shape[0] // 2
        embeddings_a = reps[:batch_size]
        embeddings_b = reps[batch_size:]
        scores = self.sim(embeddings_a.unsqueeze(1), embeddings_b.unsqueeze(0))  # [btz, btz]

        # Calculate loss for conditional ELECTRA
        with torch.no_grad():
            g_pred = generator([mlm_inputs])[1].argmax(-1)  # [btz, seq_len]
        g_pred[:, 0] = tokenizer._token_start_id
        e_labels = (g_pred != input_ids) * attention_mask
        e_inputs = g_pred * attention_mask

        # The CLS position is replaced with the sentence vector
        embeddings = self.discriminator.apply_embeddings([e_inputs])
        embeddings[0] = torch.cat([reps.unsqueeze(1), embeddings[0][:, 1:, :]], dim=1)
        outputs = self.discriminator.apply_main_layers(embeddings)
        mlm_outputs = self.discriminator.apply_final_layers(outputs)
        prediction_scores = self.electra_head(mlm_outputs)
        return scores, prediction_scores, e_labels

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return output

class MyLoss(nn.Module):
    def forward(self, model_outputs, model_labels):
        scores, prediction_scores, e_labels = model_outputs
        labels, mlm_labels, attention_mask = model_labels
        # mlm_labels is not used here; it is mainly for computing the generator loss, and the generator is not trained in this method
        loss_simcse = F.cross_entropy(scores, labels)
        loss_electra = lambda_weight * F.cross_entropy(prediction_scores.view(-1, 2), e_labels.view(-1))
        return {'loss': loss_simcse + loss_electra, 'loss_simcse': loss_simcse, 'loss_electra': loss_electra}

def cal_metric(model_outputs, model_labels):
    scores, prediction_scores, e_labels = model_outputs
    labels, mlm_labels, attention_mask = model_labels
    rep = (e_labels == 1) * attention_mask
    fix = (e_labels == 0) * attention_mask
    prediction = prediction_scores.argmax(-1)
    result = {}
    result['electra_rep_acc'] = float((prediction * rep).sum() / rep.sum())
    result['electra_fix_acc'] = float(1.0 - (prediction * fix).sum() / fix.sum())
    result['electra_acc'] = float(((prediction == e_labels) * attention_mask).sum() / attention_mask.sum())
    return result

model = Model(pool_method=pooling).to(device)
model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 7e-6), metrics=cal_metric)

class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

def evaluate(dataloader):
    # Model prediction: normalize, compute similarities and the correlation coefficient
    sims_list, labels = [], []
    for (a_token_ids, b_token_ids), label in tqdm(dataloader):
        a_vecs = model.encode(a_token_ids)
        b_vecs = model.encode(b_token_ids)
        a_vecs = torch.nn.functional.normalize(a_vecs, p=2, dim=1).cpu().numpy()
        b_vecs = torch.nn.functional.normalize(b_vecs, p=2, dim=1).cpu().numpy()

        sims = (a_vecs * b_vecs).sum(axis=1)
        sims_list.append(sims)
        labels.append(label.cpu().numpy())

    corrcoef = scipy.stats.spearmanr(np.concatenate(labels), np.concatenate(sims_list)).correlation
    return corrcoef

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=5, callbacks=[evaluator])
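MyLoss above combines the SimCSE-style contrastive term with a replaced-token-detection term weighted by lambda_weight = 0.05. A minimal sketch of that combination on dummy tensors; the shapes are chosen only for illustration and the snippet is independent of the script above:

import torch
import torch.nn.functional as F

lambda_weight = 0.05
scores = torch.randn(8, 8)                    # in-batch similarity matrix from the SimCSE branch
labels = torch.arange(8)                      # positives on the diagonal
prediction_scores = torch.randn(8, 64, 2)     # per-token original/replaced logits from the ELECTRA head
e_labels = torch.randint(0, 2, (8, 64))       # 1 where the generator changed the token

loss_simcse = F.cross_entropy(scores, labels)
loss_electra = lambda_weight * F.cross_entropy(prediction_scores.view(-1, 2), e_labels.view(-1))
print((loss_simcse + loss_electra).item())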
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_ESimCSE.py
0 → 100644
#! -*- coding: utf-8 -*-
# ESimCSE Chinese test
# | solution | ATEC  | BQ   | LCQMC | PAWSX | STS-B |
# | ESimCSE  | 34.05 | 50.54| 71.58 | 12.53 | 71.27 |
from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
import random
import copy
import sys
from bert4torch.snippets import ListDataset
import jieba
jieba.initialize()


class CollateFunc(object):
    '''Duplicate tokens inside each sentence and draw negative samples from a queue.
    '''
    def __init__(self, tokenizer, max_len=256, q_size=160, dup_rate=0.15):
        self.q = []
        self.q_size = q_size
        self.max_len = max_len
        self.dup_rate = dup_rate
        self.tokenizer = tokenizer

    def word_repetition(self, batch_text, pre_tokenize=False):
        dst_text = list()
        for text in batch_text:
            if pre_tokenize:
                cut_text = jieba.cut(text, cut_all=False)
                text = list(cut_text)
            actual_len = len(text)
            dup_len = random.randint(a=0, b=max(2, int(self.dup_rate * actual_len)))
            try:
                dup_word_index = random.sample(list(range(1, actual_len)), k=dup_len)
            except:
                dup_word_index = set()

            dup_text = ''
            for index, word in enumerate(text):
                dup_text += word
                if index in dup_word_index:
                    dup_text += word
            dst_text.append(dup_text)
        return dst_text

    def negative_samples(self, batch_src_text):
        batch_size = len(batch_src_text)
        negative_samples = None
        if len(self.q) > 0:
            negative_samples = self.q[:self.q_size]
            # print("size of negative_samples", len(negative_samples))

        if len(self.q) + batch_size >= self.q_size:
            del self.q[:batch_size]
        self.q.extend(batch_src_text)
        return negative_samples

    def __call__(self, batch_text):
        '''
        input: batch_text: [batch_text,]
        output: batch_src_text, batch_dst_text, batch_neg_text
        '''
        batch_pos_text = self.word_repetition(batch_text)
        batch_neg_text = self.negative_samples(batch_text)
        # print(len(batch_pos_text))

        batch_tokens_list, batch_pos_tokens_list = [], []
        for text, text_pos in zip(batch_text, batch_pos_text):
            batch_tokens_list.append(self.tokenizer.encode(text, maxlen=maxlen)[0])
            batch_pos_tokens_list.append(self.tokenizer.encode(text_pos, maxlen=maxlen)[0])

        batch_neg_tokens_list = []
        if batch_neg_text:
            for text in batch_neg_text:
                batch_neg_tokens_list.append(self.tokenizer.encode(text, maxlen=maxlen)[0])

        batch_tokens_list = torch.tensor(sequence_padding(batch_tokens_list), dtype=torch.long, device=device)
        batch_pos_tokens_list = torch.tensor(sequence_padding(batch_pos_tokens_list), dtype=torch.long, device=device)
        labels = torch.arange(batch_tokens_list.size(0), device=batch_tokens_list.device)
        if batch_neg_tokens_list:
            batch_neg_tokens_list = torch.tensor(sequence_padding(batch_neg_tokens_list), dtype=torch.long, device=device)
            return [batch_tokens_list, batch_pos_tokens_list, batch_neg_tokens_list], labels
        else:
            return [batch_tokens_list, batch_pos_tokens_list], labels

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'STS-B', 0.3  # for debugging
print(model_type, pooling, task_name, dropout_rate)

# For NEZHA and RoFormer the `model` argument of build_transformer_model has to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load data (with labels).
    Single sample format: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

train_call_func = CollateFunc(tokenizer, max_len=maxlen, q_size=64, dup_rate=0.15)
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=train_call_func)

def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Build the model
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.encoder = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                               dropout_rate=dropout_rate, with_pool=with_pool,
                                               output_all_encoded_layers=output_all_encoded_layers)
        self.momentum_encoder = copy.deepcopy(self.encoder)
        self.scale = scale

    def forward(self, token_ids_list):
        reps = []
        for token_ids in token_ids_list[:2]:
            hidden_state1, pooler = self.encoder([token_ids])
            rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)

        if len(token_ids_list) == 3:  # negative samples
            hidden_state1, pooler = self.momentum_encoder([token_ids_list[2]])
            rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)

        embeddings_a = reps[0]
        embeddings_b = torch.cat(reps[1:])
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.encoder([token_ids])
            output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return output

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = Model(pool_method=pooling).to(device)

class Momentum(object):
    '''Momentum update; implemented as a scheduler here because it is called after optimizer.step()
    '''
    def __init__(self, gamma=0.95) -> None:
        self.gamma = gamma

    def step(self):
        for encoder_param, moco_encoder_param in zip(model.encoder.parameters(), model.momentum_encoder.parameters()):
            moco_encoder_param.data = self.gamma * moco_encoder_param.data + (1. - self.gamma) * encoder_param.data

model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), scheduler=Momentum(gamma=0.95))

class Evaluator(Callback):
    """Evaluate and save.
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

def evaluate(dataloader):
    # Model prediction: normalize, compute similarities and the correlation coefficient
    sims_list, labels = [], []
    for (a_token_ids, b_token_ids), label in tqdm(dataloader):
        a_vecs = model.encode(a_token_ids)
        b_vecs = model.encode(b_token_ids)
        a_vecs = torch.nn.functional.normalize(a_vecs, p=2, dim=1).cpu().numpy()
        b_vecs = torch.nn.functional.normalize(b_vecs, p=2, dim=1).cpu().numpy()

        sims = (a_vecs * b_vecs).sum(axis=1)
        sims_list.append(sims)
        labels.append(label.cpu().numpy())

    corrcoef = scipy.stats.spearmanr(np.concatenate(labels), np.concatenate(sims_list)).correlation
    return corrcoef

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=5, callbacks=[evaluator])
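The Momentum "scheduler" above keeps the negative-sample encoder as an exponential moving average of the trained encoder, updated after every optimizer.step(). A minimal sketch of that EMA update on a stand-in module, purely illustrative:

import copy
import torch
import torch.nn as nn

encoder = nn.Linear(768, 768)              # stand-in for the trained BERT encoder
momentum_encoder = copy.deepcopy(encoder)  # copy used to embed the negative queue
gamma = 0.95

# ... would run after each optimizer.step() on `encoder` ...
with torch.no_grad():
    for p, p_m in zip(encoder.parameters(), momentum_encoder.parameters()):
        p_m.data = gamma * p_m.data + (1. - gamma) * p.data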
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_PromptBert.py
0 → 100644
#! -*- coding: utf-8 -*-
# promptbert实现sentence embedding
# 官方项目:https://github.com/kongds/Prompt-BERT
# 参考项目:https://github.com/Macielyoung/sentence_representation_matching
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | PromptBert | 33.98 | 49.89| 73.18 | 13.30 | 73.42 |
import
torch
import
torch.nn
as
nn
import
torch.optim
as
optim
import
torch.nn.functional
as
F
from
tqdm
import
tqdm
from
bert4torch.tokenizers
import
Tokenizer
,
load_vocab
from
bert4torch.models
import
build_transformer_model
,
BaseModel
from
bert4torch.snippets
import
ListDataset
,
sequence_padding
,
Callback
from
torch.utils.data
import
DataLoader
from
scipy.stats
import
pearsonr
,
spearmanr
import
numpy
as
np
import
sys
import
jieba
jieba
.
initialize
()
# =============================基本参数=============================
model_type
,
task_name
,
dropout_rate
=
sys
.
argv
[
1
:]
# 传入参数
# model_type, task_name, dropout_rate = 'BERT', 'ATEC', 0.3 # debug使用
print
(
model_type
,
task_name
,
dropout_rate
)
assert
model_type
in
{
'BERT'
,
'RoBERTa'
,
'NEZHA'
,
'RoFormer'
,
'SimBERT'
}
assert
task_name
in
{
'ATEC'
,
'BQ'
,
'LCQMC'
,
'PAWSX'
,
'STS-B'
}
if
model_type
in
{
'BERT'
,
'RoBERTa'
,
'SimBERT'
}:
model_name
=
'bert'
elif
model_type
in
{
'RoFormer'
}:
model_name
=
'roformer'
elif
model_type
in
{
'NEZHA'
}:
model_name
=
'nezha'
dropout_rate
=
float
(
dropout_rate
)
batch_size
=
32
template_len
=
15
if
task_name
==
'PAWSX'
:
maxlen
=
128
+
template_len
else
:
maxlen
=
64
+
template_len
# bert配置
model_dir
=
{
'BERT'
:
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12'
,
'RoBERTa'
:
'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base'
,
'NEZHA'
:
'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base'
,
'RoFormer'
:
'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base'
,
'SimBERT'
:
'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base'
,
}[
model_type
]
config_path
=
f
'
{
model_dir
}
/bert_config.json'
if
model_type
==
'BERT'
else
f
'
{
model_dir
}
/config.json'
checkpoint_path
=
f
'
{
model_dir
}
/pytorch_model.bin'
dict_path
=
f
'
{
model_dir
}
/vocab.txt'
data_path
=
'F:/Projects/data/corpus/sentence_embedding/'
device
=
'cuda'
if
torch
.
cuda
.
is_available
()
else
'cpu'
# =============================加载数据集=============================
# 建立分词器
if
model_type
in
[
'RoFormer'
]:
tokenizer
=
Tokenizer
(
dict_path
,
do_lower_case
=
True
,
pre_tokenize
=
lambda
s
:
jieba
.
lcut
(
s
,
HMM
=
False
),
add_special_tokens
=
'[X]'
)
else
:
tokenizer
=
Tokenizer
(
dict_path
,
do_lower_case
=
True
,
add_special_tokens
=
'[X]'
)
replace_token
=
"[X]"
mask_token
=
"[MASK]"
prompt_templates
=
[
'"{}" 的意思为[MASK]'
.
format
(
replace_token
),
'"{}"这句话的意思是[MASK]'
.
format
(
replace_token
)]
tao
=
0.05
token_dict
=
load_vocab
(
dict_path
)
compound_tokens
=
[[
len
(
token_dict
)]]
token_dict
[
'[X]'
]
=
len
(
token_dict
)
# 加载数据集
def
load_data
(
filenames
):
D
=
[]
for
filename
in
filenames
:
with
open
(
filename
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
for
line
in
tqdm
(
f
.
readlines
(),
desc
=
'Load data'
):
cache
=
line
.
split
(
'
\t
'
)
text1
,
text2
,
label
=
cache
[
0
][:
maxlen
-
template_len
],
cache
[
1
][:
maxlen
-
template_len
],
cache
[
-
1
]
for
text
in
[
text1
,
text2
]:
sentence_pair
=
[]
for
template
in
prompt_templates
:
sent_num
=
len
(
tokenizer
.
tokenize(text))
                    prompt_sent = template.replace(replace_token, text)
                    template_sent = template.replace(replace_token, replace_token*sent_num)
                    sentence_pair.extend([prompt_sent, template_sent])
                D.append((sentence_pair, int(label)))
    return D

all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
train_texts = load_data(all_names)
valid_texts = list(zip(train_texts[::2], train_texts[1::2]))

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# 加载训练数据集
def collate_fn(batch):
    batch_tensor = [[] for _ in range(4)]
    for prompt_data, _ in batch:
        for i, item in enumerate(prompt_data):
            batch_tensor[i].append(tokenizer.encode(item, maxlen=maxlen)[0])
    for i, item in enumerate(batch_tensor):
        batch_tensor[i] = torch.tensor(sequence_padding(item, maxlen), dtype=torch.long, device=device)
    labels = torch.arange(batch_tensor[0].size(0), device=device)
    return batch_tensor, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# 加载测试数据集
def collate_fn_test(batch):
    text1_ids, text2_ids, labels = [], [], []
    for text1, text2 in batch:
        label = text1[-1]
        text1, text2 = text1[0][0], text2[0][0]
        text1_ids.append(tokenizer.encode(text1, maxlen=maxlen)[0])
        text2_ids.append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    text1_ids = torch.tensor(sequence_padding(text1_ids), dtype=torch.long, device=device)
    text2_ids = torch.tensor(sequence_padding(text2_ids), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.long, device=device)
    return [text1_ids, text2_ids], labels
valid_dataloader = DataLoader(ListDataset(data=valid_texts), batch_size=batch_size, collate_fn=collate_fn_test)

# =============================定义模型=============================
class PromptBert(BaseModel):
    def __init__(self, scale=20.0):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=model_name,
                                            dropout_rate=dropout_rate, segment_vocab_size=0, compound_tokens=compound_tokens)
        self.scale = scale

    def forward(self, prompt0_input, template0_input, prompt1_input, template1_input):
        embeddings_a = self.get_sentence_embedding(prompt0_input, template0_input)
        embeddings_b = self.get_sentence_embedding(prompt1_input, template1_input)
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def get_sentence_embedding(self, prompt_input_ids, template_input_ids):
        prompt_mask_embedding = self.get_mask_embedding(prompt_input_ids)
        template_mask_embedding = self.get_mask_embedding(template_input_ids)
        # 在计算损失函数时为了消除Prompt模板影响,通过替换模板后的句子[MASK]获取的表征减去模板中[MASK]获取的表征来得到句子向量表征
        sentence_embedding = prompt_mask_embedding - template_mask_embedding
        return sentence_embedding

    def get_mask_embedding(self, input_ids):
        last_hidden_state = self.bert([input_ids])
        mask_index = (input_ids == tokenizer._token_mask_id).long()
        input_mask_expanded = mask_index.unsqueeze(-1).expand(last_hidden_state.size()).float()
        mask_embedding = torch.sum(last_hidden_state * input_mask_expanded, 1)
        return mask_embedding

    def predict(self, input_ids):
        self.eval()
        with torch.no_grad():
            mask_embedding = self.get_mask_embedding(input_ids)
        return mask_embedding

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = PromptBert().to(device)

# 定义使用的loss和optimizer,这里支持自定义
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_sim = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_sim = self.evaluate(valid_dataloader)
        if val_sim > self.best_val_sim:
            self.best_val_sim = val_sim
            # model.save_weights('best_model.pt')
        print(f'val_sim: {val_sim:.5f}, best_val_sim: {self.best_val_sim:.5f}\n')

    @staticmethod
    def evaluate(data):
        embeddings1, embeddings2, labels = [], [], []
        for (text1_ids, text2_ids), label in data:
            embeddings1.append(model.predict(text1_ids))
            embeddings2.append(model.predict(text2_ids))
            labels.append(label)
        embeddings1 = torch.cat(embeddings1)
        embeddings2 = torch.cat(embeddings2)
        labels = torch.cat(labels)
        sims = F.cosine_similarity(embeddings1, embeddings2).cpu().numpy()
        labels = labels.cpu().numpy()
        return spearmanr(sims, labels)[0]

if __name__ == "__main__":
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
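A minimal inference sketch (not part of the committed file) showing how the trained PromptBert could embed a single raw sentence with the same template-debias trick as get_sentence_embedding. The template/replace_token arguments and the sent_num heuristic mirror the load_data logic above and are assumptions here, not library API:

def encode_sentence(text, template, replace_token):
    # Same debiasing as get_sentence_embedding: [MASK] representation of the filled
    # prompt minus the [MASK] representation of the bare template.
    sent_num = len(tokenizer.tokenize(text))
    prompt_sent = template.replace(replace_token, text)
    template_sent = template.replace(replace_token, replace_token * sent_num)
    to_tensor = lambda s: torch.tensor(sequence_padding([tokenizer.encode(s, maxlen=maxlen)[0]]),
                                       dtype=torch.long, device=device)
    return model.predict(to_tensor(prompt_sent)) - model.predict(to_tensor(template_sent))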
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_SimCSE.py
#! -*- coding: utf-8 -*-
# SimCSE 中文测试
# bert4keras链接:https://kexue.fm/archives/8348
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | SimCSE | 33.90 | 50.29| 71.81 | 13.14 | 71.09 |

from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
from bert4torch.snippets import ListDataset
import sys
import jieba
jieba.initialize()

# =============================基本参数=============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # 传入参数
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.3  # debug使用
print(model_type, pooling, task_name, dropout_rate)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# bert配置
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# =============================加载数据集=============================
# 建立分词器
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# 读数据
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """加载数据(带标签)
    单条格式:(文本1, 文本2, 标签)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# 加载训练数据集
def collate_fn(batch):
    texts_list = [[] for _ in range(2)]
    for text in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        texts_list[0].append(token_ids)
        texts_list[1].append(token_ids)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
    return texts_list, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)

# 加载测试数据集
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# 建立模型
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                            dropout_rate=dropout_rate, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)
        self.scale = scale

    def forward(self, token_ids_list):
        reps = []
        for token_ids in token_ids_list:
            hidden_state1, pooler = self.bert([token_ids])
            rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)
        embeddings_a = reps[0]
        embeddings_b = torch.cat(reps[1:])
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return output

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = Model(pool_method=pooling).to(device)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5))

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

def evaluate(dataloader):
    # 模型预测
    # 标准化,相似度,相关系数
    sims_list, labels = [], []
    for (a_token_ids, b_token_ids), label in tqdm(dataloader):
        a_vecs = model.encode(a_token_ids)
        b_vecs = model.encode(b_token_ids)
        a_vecs = torch.nn.functional.normalize(a_vecs, p=2, dim=1).cpu().numpy()
        b_vecs = torch.nn.functional.normalize(b_vecs, p=2, dim=1).cpu().numpy()

        sims = (a_vecs * b_vecs).sum(axis=1)
        sims_list.append(sims)
        labels.append(label.cpu().numpy())

    corrcoef = scipy.stats.spearmanr(np.concatenate(labels), np.concatenate(sims_list)).correlation
    return corrcoef

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=5, callbacks=[evaluator])
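The training objective above is the standard in-batch contrastive loss: nn.CrossEntropyLoss applied to a scaled cosine-similarity matrix whose diagonal (the same sentence encoded twice, so dropout supplies the augmentation) is the positive class, with torch.arange providing the labels. A self-contained sketch of just that objective, outside the bert4torch wrappers, for reference:

import torch
import torch.nn as nn
import torch.nn.functional as F

def in_batch_contrastive_loss(emb_a, emb_b, scale=20.0):
    # emb_a / emb_b: [btz, hdsz]; row i of emb_a and row i of emb_b are positives,
    # every other row in the batch acts as a negative.
    scores = F.normalize(emb_a, dim=1) @ F.normalize(emb_b, dim=1).T * scale  # [btz, btz]
    labels = torch.arange(scores.size(0), device=scores.device)
    return nn.CrossEntropyLoss()(scores, labels)

# e.g. a batch of 4 random 8-dim vectors
loss = in_batch_contrastive_loss(torch.randn(4, 8), torch.randn(4, 8))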
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_TSDAE.py
#! -*- coding:utf-8 -*-
# 语义相似度任务-无监督
# 一个encoder输入删减后的句子生成句向量,decoder依据这个句子向量来恢复原句
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B | comment |
# | TSDAE | —— | 46.65| 65.30 | 12.54 | —— | ——表示该指标异常未记录 |

from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
import re
from tqdm import tqdm
import sys
import jieba
jieba.initialize()

# =============================基本参数=============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # 传入参数
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # debug使用
print(model_type, pooling, task_name, dropout_rate)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# bert配置
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# =============================加载数据集=============================
# 建立分词器
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# 读数据
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """加载数据(带标签)
    单条格式:(文本1, 文本2, 标签)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# 加载训练数据集
def collate_fn(batch):
    def add_noise(token_ids, del_ratio=0.6):
        n = len(token_ids)
        keep_or_not = np.random.rand(n) > del_ratio
        if sum(keep_or_not) == 0:
            keep_or_not[np.random.choice(n)] = True  # guarantee that at least one word remains
        return list(np.array(token_ids)[keep_or_not])

    texts_list = [[] for _ in range(3)]
    for text in batch:
        token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
        texts_list[0].append([tokenizer._token_start_id] + add_noise(token_ids[1:-1]) + [tokenizer._token_end_id])
        texts_list[1].append(token_ids[:-1])
        texts_list[2].append(token_ids[1:])
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    return texts_list[:2], texts_list[2].flatten()
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)

# 加载测试数据集
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# 定义bert上的模型结构
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.encoder = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                               dropout_rate=dropout_rate, with_pool=with_pool,
                                               output_all_encoded_layers=output_all_encoded_layers)
        # 用bert的权重来初始化decoder,crossAttn部分是随机初始化的
        self.decoder = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=model_name,
                                               application='lm', dropout_rate=dropout_rate,
                                               output_all_encoded_layers=output_all_encoded_layers,
                                               is_decoder=True, segment_vocab_size=0)
        self.pool_method = pool_method

        # 绑定encoder和decoder的权重
        decoder_names = {k for k, _ in self.decoder.named_parameters()}
        for enc_k, v in self.encoder.named_parameters():
            dec_k = enc_k
            if dec_k in decoder_names:
                rep_str = f'self.encoder.{enc_k} = self.decoder.{dec_k}'
                if re.search('\.[0-9]+\.', rep_str):
                    temp = '[' + re.findall('\.[0-9]+\.', rep_str)[0][1:-1] + '].'
                    rep_str = re.sub('\.[0-9]+\.', temp, rep_str)
                exec(rep_str)
            else:
                print(enc_k, dec_k)

    def forward(self, token_ids_list):
        token_ids1 = token_ids_list[0]
        hidden_state1, pool_cls1 = self.encoder([token_ids1])
        embeddings_a = get_pool_emb(hidden_state1, pool_cls1, token_ids1.gt(0).long(), self.pool_method)

        token_ids2 = token_ids_list[1]
        encoder_embedding = embeddings_a.unsqueeze(1)
        encoder_attention_mask = torch.ones_like(token_ids1)[:, 0:1][:, None, None, :]
        _, logits = self.decoder([token_ids2, encoder_embedding, encoder_attention_mask])
        return logits.reshape(-1, logits.shape[-1])

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.encoder([token_ids])
            output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return output
model = Model(pool_method=pooling).to(device)

# 定义使用的loss和optimizer,这里支持自定义
model.compile(
    loss=nn.CrossEntropyLoss(ignore_index=0),
    optimizer=optim.Adam(model.parameters(), lr=2e-4),
)

# 定义评价函数
def evaluate(data):
    cosine_scores, labels = [], []
    for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
        embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
        embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
        cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        cosine_scores.append(cosine_score)
        labels.append(label)
    cosine_scores = np.concatenate(cosine_scores)
    labels = torch.cat(labels).cpu().numpy()
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_unsup_bert_whitening.py
#! -*- coding:utf-8 -*-
# bert_whitening
# 官方项目:https://github.com/bojone/BERT-whitening
# cls+不降维
# | solution | ATEC | BQ | LCQMC | PAWSX | STS-B |
# | Bert-whitening | 26.79 | 31.81| 56.34 | 17.22 | 67.45 |

from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, get_pool_emb
from bert4torch.layers import BERT_WHITENING
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import numpy as np
import scipy.stats
import sys
import jieba
jieba.initialize()

# =============================基本参数=============================
# model_type, pooling, task_name, n_components = sys.argv[1:]  # 传入参数
model_type, pooling, task_name, n_components = 'BERT', 'cls', 'ATEC', -1  # debug使用
print(model_type, pooling, task_name, n_components)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

n_components = int(n_components)
if n_components < 0:
    if model_type.endswith('large'):
        n_components = 1024
    elif model_type.endswith('tiny'):
        n_components = 312
    elif model_type.endswith('small'):
        n_components = 384
    else:
        n_components = 768

batch_size = 128

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# bert配置
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# =============================加载数据集=============================
# 建立分词器
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# 读数据
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """加载数据
        单条格式:(文本1, 文本2, 标签id)
        """
        D = []
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    l = l.strip().split('\t')
                    if len(l) == 3:
                        D.append((l[0], l[1], float(l[2])))
                    # if len(D) > 1000:
                    #     break
        return D

def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([label])
    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# 加载数据集
train_dataloader = DataLoader(MyDataset(all_names), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# 定义bert上的模型结构
class Model(BaseModel):
    def __init__(self, pool_method='mean'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.pool_method = pool_method

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pool_cls, attention_mask, self.pool_method)
        return output
model = Model().to(device)

# 提取训练集的所有句向量
sen_emb_list, sen_labels = [], []
for token_ids, labels in tqdm(train_dataloader, desc='Encoding'):
    sen1_emb = model.encode(token_ids[0])
    sen2_emb = model.encode(token_ids[1])
    sen_emb_list.append((sen1_emb, sen2_emb))
    sen_labels.append(labels)

# 调用bert_whitening模块
bert_whitening = BERT_WHITENING()
if n_components > 0:
    bert_whitening.compute_kernel_bias([v for vecs in sen_emb_list for v in vecs])
    bert_whitening.kernel = bert_whitening.kernel[:, :n_components]

# 变换,标准化,相似度,相关系数
all_sims = []
for (a_vecs, b_vecs) in tqdm(sen_emb_list, desc='Transform'):
    a_vecs = bert_whitening.transform_and_normalize(a_vecs)
    b_vecs = bert_whitening.transform_and_normalize(b_vecs)
    sims = (a_vecs * b_vecs).sum(axis=1)
    all_sims.append(sims)
all_sims = torch.cat(all_sims, dim=0)
sen_labels = torch.cat(sen_labels, dim=0)
corrcoef = scipy.stats.spearmanr(sen_labels.cpu().numpy(), all_sims.cpu().numpy()).correlation
print(f'{task_name} corrcoefs: ', corrcoef)
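BERT_WHITENING is imported from bert4torch.layers above; for reference, the whitening recipe it follows (per the BERT-whitening project linked in the header) can be sketched in numpy in a few lines. This is an illustrative re-derivation under that assumption, not the library code itself: mean-centre the sentence vectors, then rotate and rescale them with the SVD of their covariance.

import numpy as np

def compute_kernel_bias(vecs):
    # vecs: [num_samples, hdsz]
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.cov(vecs.T)
    u, s, vh = np.linalg.svd(cov)
    W = np.dot(u, np.diag(1 / np.sqrt(s)))  # whitening kernel
    return W, -mu                            # kernel, bias

def transform_and_normalize(vecs, kernel, bias):
    vecs = (vecs + bias).dot(kernel)
    return vecs / (np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-8)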
bert/bert4torch_cmcc/examples/seq2seq/task_kgclue_seq2seq.py
#! -*- coding: utf-8 -*-
# KgCLUE baseline
# 直接用UniLM做Seq2Seq,然后前缀树约束解码,并加入自研的“前瞻”策略;
# 基础模型为RoFormer-Sim-FT,相比直接用RoFormer/BERT/RoBERTa有2%的提升;
# 介绍链接:https://kexue.fm/archives/8802

import os, json
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch.optim as optim
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset, sequence_padding, AutoRegressiveDecoder, Callback
from tqdm import tqdm
from collections import defaultdict
# import pylcs

def lcs(source, target):
    """最长公共子序列(source和target的最长非连续子序列)
    返回:子序列长度, 映射关系(映射对组成的list)
    注意:最长公共子序列可能不止一个,所返回的映射只代表其中一个。
    """
    c = defaultdict(int)
    for i, si in enumerate(source, 1):
        for j, tj in enumerate(target, 1):
            if si == tj:
                c[i, j] = c[i - 1, j - 1] + 1
            elif c[i, j - 1] > c[i - 1, j]:
                c[i, j] = c[i, j - 1]
            else:
                c[i, j] = c[i - 1, j]
    l, mapping = c[len(source), len(target)], []
    i, j = len(source) - 1, len(target) - 1
    while len(mapping) < l:
        if source[i] == target[j]:
            mapping.append((i, j))
            i, j = i - 1, j - 1
        elif c[i + 1, j] > c[i, j + 1]:
            j = j - 1
        else:
            i = i - 1
    return l, mapping[::-1]
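# Illustrative example (added for clarity, not in the original file):
# lcs('abcde', 'ace') -> (3, [(0, 0), (2, 1), (4, 2)])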
def subject_split(s):
    """如果有义项,那么单独分离出来
    """
    m = ''
    if s[-1] == u')':
        i = s.index(u'(')
        m = s[i + 1:-1]
        s = s[:i]
    return s, m

def load_data(filename):
    """读取数据集
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            s, p, o = l['answer'].split(' ||| ')
            s, m = subject_split(s)
            D.append((l['question'], (s, p, m, ' '.join(o.split()))))
    return D

class Trie(object):
    """自定义Trie树对象,用来保存知识库
    """
    def __init__(self, value_key=-1):
        self.data = {}
        self.value_key = str(value_key)

    def __setitem__(self, key, value):
        """传入一对(key, value)到前缀树中
        """
        data = self.data
        for k in key:
            k = str(k)
            if k not in data:
                data[k] = {}
            data = data[k]
        if self.value_key in data:
            if data[self.value_key] != value:
                data[self.value_key] += ('\t' + value)
        else:
            data[self.value_key] = value

    def __getitem__(self, key):
        """获取key对应的value
        """
        data = self.data
        for k in key:
            k = str(k)
            data = data[k]
        return data[self.value_key]

    def next_ones(self, prefix):
        """获取prefix后一位的容许集
        """
        data = self.data
        for k in prefix:
            k = str(k)
            data = data[k]
        return [k for k in data if k != self.value_key]

    def keys(self, prefix=None, data=None):
        """获取以prefix开头的所有key
        """
        data = data or self.data
        prefix = prefix or []
        for k in prefix:
            k = str(k)
            if k not in data:
                return []
            data = data[k]
        results = []
        for k in data:
            if k == self.value_key:
                results.append([])
            else:
                results.extend([[k] + j for j in self.keys(None, data[k])])
        return [prefix + i for i in results]

    def save(self, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.data, f, ensure_ascii=False)

    def load(self, filename):
        with open(filename, encoding='utf-8') as f:
            self.data = json.load(f)
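# Illustrative usage (added for clarity, not in the original file); keys are stored as
# strings internally, so returned sub-keys come back as str:
#   demo = Trie(); demo[[101, 102]] = 'obj'
#   demo[[101, 102]]        # -> 'obj'
#   demo.next_ones([101])   # -> ['102']
#   demo.keys([101])        # -> [[101, '102']]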
# 基本参数
maxlen = 128
batch_size = 32
epochs = 10

# 模型路径
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 加载分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# 转换知识库
KG = Trie()
if os.path.exists('../datasets/KG.json'):
    KG.load('../datasets/KG.json')
else:
    with open('F:/Projects/data/corpus/kg/KgCLUE/Knowledge_20211215.txt', 'r', encoding='utf-8') as f:
        # count = 0
        for l in tqdm(f):
            s, p, o = l.split('\t')
            s, m = subject_split(s)
            ids = tokenizer.encode(s, p)[0][1:]
            ids += tokenizer.encode(m)[0][1:-1]
            KG[ids] = ' '.join(o.split())
            # count += 1
            # if count > 10000:
            #     break
    KG.save('../datasets/KG.json')

def collate_fn(batch):
    """数据生成器
    单条样本:[CLS] Q [SEP] S [SEP] P [SEP] M [SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for (q, a) in batch:
        q_ids = tokenizer.encode(q, maxlen=maxlen // 2 + 1)[0]
        a_ids = tokenizer.encode(a[0], a[1])[0]
        a_ids += tokenizer.encode(a[2])[0][1:]
        token_ids = (q_ids + a_ids[1:])[:maxlen]
        segment_ids = [0] * len(q_ids)
        segment_ids += [1] * (len(token_ids) - len(q_ids))
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

# 读取数据集
train_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/train.json')
train_dataloader = DataLoader(ListDataset(train_data), shuffle=True, collate_fn=collate_fn)
valid_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/dev.json')
test_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/test_public.json')

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, vocab_size]
        targets: y_true, y_segment
        unilm式样,需要手动把非seq2seq部分mask掉
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # 目标token_ids
        y_mask = y_mask[:, 1:]  # segment_ids,刚好指示了要预测的部分
        y_pred = y_pred[:, :-1, :]  # 预测序列,错开一位
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model = build_transformer_model(config_path, checkpoint_path, model='roformer', application='unilm').to(device)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 5e-6))

class AutoQA(AutoRegressiveDecoder):
    """seq2seq解码器
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        all_token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([all_token_ids, segment_ids])
        probas = F.softmax(y_pred[:, -1, :], dim=-1)
        new_probas = torch.zeros_like(probas)
        for i, ids in enumerate(output_ids):
            ids = ids.cpu().numpy()
            next_ids = [int(j) for j in KG.next_ones(ids)]  # 下一位容许集
            # ===========如果t时刻为Pt的前缀树中的短句,带来的信息增益越大,则增加Pt的概率
            if len(next_ids) > 1 and self.end_id in ids:  # 容许集大于1且已解码出S
                candidates = KG.keys(list(ids))  # 可能解码结果
                weights = torch.ones_like(probas[i])  # 默认权重为1
                lcs0 = lcs(ids, token_ids[i])[0]  # 当前已经覆盖的token数
                for c in candidates:
                    if len(c) > len(ids):
                        c = [int(j) for j in c]
                        w = lcs(c, token_ids[i])[0] - lcs0  # 未来还可能覆盖的token数
                        weights[c[len(ids)]] = max(w + 1, weights[c[len(ids)]].cpu().numpy())
                probas[i] = torch.pow(probas[i], 1. / weights)  # 按 p^(1/n) 来增大权重
            if not next_ids:  # 如果容许集为空,意味着要结束了
                next_ids.append(self.end_id)
            new_probas[i, next_ids] += probas[i, next_ids]  # 只保留容许集概率
        new_probas /= new_probas.sum(axis=1, keepdims=True)  # 重新归一化
        return new_probas

    def generate(self, text, topk=1):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk, min_ends=3)  # 基于beam search
        end_idxs = [i for i, j in enumerate(output_ids) if j == self.end_id]
        subject_ids = output_ids[:end_idxs[0]]
        predicate_ids = output_ids[end_idxs[0]:end_idxs[1]]
        meaning_ids = output_ids[end_idxs[1]:]
        return (
            tokenizer.decode(subject_ids.cpu().numpy()),
            tokenizer.decode(predicate_ids.cpu().numpy()),
            tokenizer.decode(meaning_ids.cpu().numpy()),
            KG[output_ids[:-1].cpu().numpy()]
        )

autoqa = AutoQA(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_score = 0

    def on_epoch_end(self, steps, epoch, logs=None):
        # 保存最优
        em, f1, score = self.evaluate(valid_data, topk=3)
        if score >= self.best_score:
            self.best_score = score
            # model.save_weights('./best_model.weights')
        print(u'[VALID] em: %.5f, f1: %.5f, score: %.5f, best_score: %.5f\n' % (em, f1, score, self.best_score))

    def f1sim(self, text_a, text_b):
        """计算两个文本之间的f1相似度
        说明:算出两个文本的最长公共子序列长度,然后乘2并处以两者
        长度之和。推荐用pylcs算,速度较快。
        """
        if not text_a and not text_b:
            return 0.
        else:
            lcs_len = lcs(text_a, text_b)[0]
            return 2. * lcs_len / (len(text_a) + len(text_b))

    def evaluate(self, data, topk=1):
        """评估函数
        注意:同一(S, P)对应的O可能有多个,但标注数据只保留了
        一个,为了跟标注数据对齐来提高分数,这里也只保留第一个。
        """
        em, f1, total = 0., 0., 0.
        for d in tqdm(data, ncols=0):
            a = autoqa.generate(d[0], topk=topk)
            o = a[3].split('\t')[0]  # 如果有多个,只保留第一个
            em += float(o == d[1][3])
            f1 += self.f1sim(o, d[1][3])
            total += 1
        em /= total
        f1 /= total
        return em, f1, (em + f1) / 2

def test_predict(in_file, out_file, topk=1):
    """输出测试结果到文件
    结果文件可以提交到 https://www.cluebenchmarks.com 评测。
    """
    fw = open(out_file, 'w')
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            s, p, m, o = autoqa.generate(l['question'], topk=topk)
            if m:
                s += u'(%s)' % m
            l['answer'] = '%s ||| %s ||| %s' % (s, p, o.split('\t')[0])
            l = json.dumps(l, ensure_ascii=False)
            fw.write(l + '\n')
    fw.close()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=epochs, callbacks=[evaluator])
    model.load_weights('./best_model.weights')
    em, f1, score = evaluator.evaluate(test_data, topk=1)
    print(u'[TEST] topk=1, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
    em, f1, score = evaluator.evaluate(test_data, topk=3)
    print(u'[TEST] topk=3, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
    em, f1, score = evaluator.evaluate(test_data, topk=5)
    print(u'[TEST] topk=5, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
else:
    model.load_weights('./best_model.weights')
    # test_predict('../datasets/test.json', 'kgclue_predict.json', topk=3)
bert/bert4torch_cmcc/examples/seq2seq/task_question_answer_generation_by_seq2seq.py
#! -*- coding: utf-8- -*-
# 用Seq2Seq做阅读理解构建
# 根据篇章先采样生成答案,然后采样生成问题
# 数据集同 https://github.com/bojone/dgcnn_for_reading_comprehension

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

# 基本参数
max_p_len = 128
max_q_len = 64
max_a_len = 16
batch_size = 24
epochs = 100

# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def process_data():
    if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json'):
        return

    # 标注数据
    webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
    sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))

    # 筛选数据
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    data = []
    for d in webqa_data + sogou_data:
        for p in d['passages']:
            if p['answer']:
                for t in text_segmentate(p['passage'], max_p_len - 2, seps, strips):
                    if p['answer'] in t:
                        data.append((t, d['question'], p['answer']))
    del webqa_data
    del sogou_data

    # 保存一个随机序(供划分valid用)
    random_order = list(range(len(data)))
    np.random.seed(2022)
    np.random.shuffle(random_order)

    # 划分valid
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
    json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json', 'w'), indent=4)
    json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data_list_format.json', 'w'), indent=4)
process_data()

class MyDataset(ListDataset):
    @staticmethod
    def load_data(file_path):
        return json.load(open(file_path))

# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for (p, q, a) in batch:
        p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len + 1)
        a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len)
        q_token_ids, _ = tokenizer.encode(q, maxlen=max_q_len)
        token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:]  # 去掉answer和question的cls位
        segment_ids = [0] * len(p_token_ids)
        segment_ids += [1] * (len(token_ids) - len(p_token_ids))
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data_list_format.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, hdsz]
        targets: y_true, y_segment
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # 目标token_ids
        y_mask = y_mask[:, 1:]  # segment_ids,刚好指示了要预测的部分
        y_pred = y_pred[:, :-1, :]  # 预测序列,错开一位
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class QuestionAnswerGeneration(AutoRegressiveDecoder):
    """随机生成答案,并且通过beam search来生成问题
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, passage, topk=1, topp=0.95):
        token_ids, segment_ids = tokenizer.encode(passage, maxlen=max_p_len)
        a_ids = self.random_sample([token_ids, segment_ids], 1, topp=topp)[0]  # 基于随机采样
        token_ids += list(a_ids)
        segment_ids += [1] * len(a_ids)
        q_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # 基于beam search
        return (tokenizer.decode(q_ids.cpu().numpy()), tokenizer.decode(a_ids.cpu().numpy()))

qag = QuestionAnswerGeneration(start_id=None, end_id=tokenizer._token_end_id, maxlen=max_q_len, device=device)

def predict_to_file(data, filename, topk=1):
    """将预测结果输出到文件,方便评估
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
            q, a = qag.generate(d[0])
            s = '%s\t%s\t%s\n' % (q, a, d[0])
            f.write(s)
            f.flush()

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # 保存最优
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        predict_to_file(valid_dataset.data[:100], 'qa.csv')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=100, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
    # predict_to_file(valid_data, 'qa.csv')
bert/bert4torch_cmcc/examples/seq2seq/task_reading_comprehension_by_mlm.py
#! -*- coding: utf-8 -*-
# 用MLM的方式做阅读理解任务
# 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import re
import torch.nn.functional as F

# 基本参数
max_p_len = 256
max_q_len = 64
max_a_len = 32
batch_size = 12
epochs = 10

# bert配置
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def process_data():
    if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'):
        return

    # 标注数据
    webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
    sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))

    # 保存一个随机序(供划分valid用)
    random_order = list(range(len(sogou_data)))
    np.random.seed(2022)
    np.random.shuffle(random_order)

    # 划分valid
    train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
    valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
    train_data.extend(train_data)
    train_data.extend(webqa_data)  # 将SogouQA和WebQA按2:1的比例混合
    json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json', 'w', encoding='utf-8'), indent=4)
    json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json', 'w', encoding='utf-8'), indent=4)
process_data()

class MyDataset(ListDataset):
    @staticmethod
    def load_data(file_path):
        return json.load(open(file_path))

# 加载并精简词表,建立分词器
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """单条样本格式为
    输入: [CLS][MASK][MASK][SEP]问题[SEP]篇章[SEP]
    输出: 答案
    """
    batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []
    for D in batch:
        question = D['question']
        answers = [p['answer'] for p in D['passages'] if p['answer']]
        passage = np.random.choice(D['passages'])['passage']
        passage = re.sub(u' |、|;|,', ',', passage)
        final_answer = ''
        for answer in answers:
            if all([a in passage[:max_p_len - 2] for a in answer.split(' ')]):
                final_answer = answer.replace(' ', ',')
                break
        a_token_ids, _ = tokenizer.encode(final_answer, maxlen=max_a_len + 1)
        q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
        p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
        token_ids = [tokenizer._token_start_id]
        token_ids += ([tokenizer._token_mask_id] * max_a_len)
        token_ids += [tokenizer._token_end_id]
        token_ids += (q_token_ids[1:] + p_token_ids[1:])
        segment_ids = [0] * len(token_ids)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_a_token_ids.append(a_token_ids[1:])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_a_token_ids = torch.tensor(sequence_padding(batch_a_token_ids, max_a_len), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_a_token_ids

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    keep_tokens=keep_tokens,  # 只保留keep_tokens中的字,精简原字表
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, y_true):
        '''
        y_pred: [btz, seq_len, hdsz]
        y_true: [btz, max_a_len]
        '''
        _, y_pred = outputs
        y_pred = y_pred[:, 1:max_a_len + 1, :]  # 预测序列,错开一位
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = y_true.flatten()
        return super().forward(y_pred, y_true)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

def get_ngram_set(x, n):
    """生成ngram合集,返回结果格式是:
    {(n-1)-gram: set([n-gram的第n个字集合])}
    """
    result = {}
    for i in range(len(x) - n + 1):
        k = tuple(x[i:i + n])
        if k[:-1] not in result:
            result[k[:-1]] = set()
        result[k[:-1]].add(k[-1])
    return result
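# Illustrative example (added for clarity, not in the original file):
# get_ngram_set([1, 2, 3, 2, 4], 2) -> {(1,): {2}, (2,): {3, 4}, (3,): {2}}
# i.e. each (n-1)-gram maps to the set of tokens that may follow it; gen_answer below
# uses this to restrict decoding to spans that actually occur in the passage.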
def gen_answer(question, passages):
    """由于是MLM模型,所以可以直接argmax解码。
    """
    all_p_token_ids, token_ids, segment_ids = [], [], []
    for passage in passages:
        passage = re.sub(u' |、|;|,', ',', passage)
        p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
        q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
        all_p_token_ids.append(p_token_ids[1:])
        token_ids.append([tokenizer._token_start_id])
        token_ids[-1] += ([tokenizer._token_mask_id] * max_a_len)
        token_ids[-1] += [tokenizer._token_end_id]
        token_ids[-1] += (q_token_ids[1:] + p_token_ids[1:])
        segment_ids.append([0] * len(token_ids[-1]))
    token_ids = torch.tensor(sequence_padding(token_ids), device=device)
    segment_ids = torch.tensor(sequence_padding(segment_ids), device=device)
    logit = model.predict([token_ids, segment_ids])[-1][:, 1:max_a_len + 1, :]
    probas = F.softmax(logit, dim=-1)
    results = {}
    for t, p in zip(all_p_token_ids, probas):
        a, score = tuple(), 0.
        for i in range(max_a_len):
            idxs = list(get_ngram_set(t, i + 1)[a])
            if tokenizer._token_end_id not in idxs:
                idxs.append(tokenizer._token_end_id)
            # pi是将passage以外的token的概率置零
            pi = torch.zeros_like(p[i])
            pi[idxs] = p[i, idxs]
            a = a + (pi.argmax().item(),)
            score += pi.max().item()
            if a[-1] == tokenizer._token_end_id:
                break
        score = score / (i + 1)
        a = tokenizer.decode(a)
        if a:
            results[a] = results.get(a, []) + [score]
    results = {
        k: (np.array(v) ** 2).sum() / (sum(v) + 1)
        for k, v in results.items()
    }
    return results

def max_in_dict(d):
    if d:
        return sorted(d.items(), key=lambda s: -s[1])[0][0]

def predict_to_file(data, filename):
    """将预测结果输出到文件,方便评估
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
            q_text = d['question']
            p_texts = [p['passage'] for p in d['passages']]
            a = gen_answer(q_text, p_texts)
            a = max_in_dict(a)
            if a:
                s = u'%s\t%s\n' % (d['id'], a)
            else:
                s = u'%s\t\n' % (d['id'])
            f.write(s)
            f.flush()

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # 保存最优
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        predict_to_file(valid_dataset.data[:100], 'qa.csv')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=100, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
    # predict_to_file(valid_data, 'qa.csv')
bert/bert4torch_cmcc/examples/seq2seq/task_reading_comprehension_by_seq2seq.py
#! -*- coding: utf-8 -*-
# 用seq2seq的方式做阅读理解任务
# 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension
import
json
,
os
from
bert4torch.models
import
build_transformer_model
from
bert4torch.tokenizers
import
Tokenizer
,
load_vocab
from
bert4torch.snippets
import
sequence_padding
,
text_segmentate
from
bert4torch.snippets
import
AutoRegressiveDecoder
,
Callback
,
ListDataset
from
tqdm
import
tqdm
import
torch
from
torchinfo
import
summary
import
torch.nn
as
nn
import
torch.optim
as
optim
from
torch.utils.data
import
DataLoader
import
numpy
as
np
import
re
# 基本参数
max_p_len
=
256
max_q_len
=
64
max_a_len
=
32
max_qa_len
=
max_q_len
+
max_a_len
batch_size
=
8
epochs
=
10
# bert配置
config_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device
=
'cuda'
if
torch
.
cuda
.
is_available
()
else
'cpu'
def
process_data
():
if
os
.
path
.
exists
(
'F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'
):
return
# 标注数据
webqa_data
=
json
.
load
(
open
(
'F:/Projects/data/corpus/qa/WebQA.json'
,
encoding
=
'utf-8'
))
sogou_data
=
json
.
load
(
open
(
'F:/Projects/data/corpus/qa/SogouQA.json'
,
encoding
=
'utf-8'
))
# 保存一个随机序(供划分valid用)
random_order
=
list
(
range
(
len
(
sogou_data
)))
np
.
random
.
seed
(
2022
)
np
.
random
.
shuffle
(
random_order
)
# 划分valid
train_data
=
[
sogou_data
[
j
]
for
i
,
j
in
enumerate
(
random_order
)
if
i
%
3
!=
0
]
valid_data
=
[
sogou_data
[
j
]
for
i
,
j
in
enumerate
(
random_order
)
if
i
%
3
==
0
]
train_data
.
extend
(
train_data
)
train_data
.
extend
(
webqa_data
)
# 将SogouQA和WebQA按2:1的比例混合
json
.
dump
(
train_data
,
open
(
'F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'
,
'w'
,
encoding
=
'utf-8'
),
indent
=
4
)
json
.
dump
(
valid_data
,
open
(
'F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json'
,
'w'
,
encoding
=
'utf-8'
),
indent
=
4
)
process_data
()
class
MyDataset
(
ListDataset
):
@
staticmethod
def
load_data
(
file_path
):
return
json
.
load
(
open
(
file_path
))
# 加载并精简词表,建立分词器
token_dict
,
keep_tokens
=
load_vocab
(
dict_path
=
dict_path
,
simplified
=
True
,
startswith
=
[
'[PAD]'
,
'[UNK]'
,
'[CLS]'
,
'[SEP]'
],
)
tokenizer
=
Tokenizer
(
token_dict
,
do_lower_case
=
True
)
def
collate_fn
(
batch
):
"""单条样本格式: [CLS]篇章[SEP]问题[SEP]答案[SEP]
"""
batch_token_ids
,
batch_segment_ids
=
[],
[]
for
D
in
batch
:
question
=
D
[
'question'
]
answers
=
[
p
[
'answer'
]
for
p
in
D
[
'passages'
]
if
p
[
'answer'
]]
passage
=
np
.
random
.
choice
(
D
[
'passages'
])[
'passage'
]
passage
=
re
.
sub
(
u
' |、|;|,'
,
','
,
passage
)
final_answer
=
''
for
answer
in
answers
:
if
all
([
a
in
passage
[:
max_p_len
-
2
]
for
a
in
answer
.
split
(
' '
)]):
final_answer
=
answer
.
replace
(
' '
,
','
)
break
qa_token_ids
,
qa_segment_ids
=
tokenizer
.
encode
(
question
,
final_answer
,
maxlen
=
max_qa_len
+
1
)
p_token_ids
,
p_segment_ids
=
tokenizer
.
encode
(
passage
,
maxlen
=
max_p_len
+
1
)
token_ids
=
p_token_ids
+
qa_token_ids
[
1
:]
segment_ids
=
p_segment_ids
+
qa_segment_ids
[
1
:]
batch_token_ids
.
append
(
token_ids
)
batch_segment_ids
.
append
(
segment_ids
)
batch_token_ids
=
torch
.
tensor
(
sequence_padding
(
batch_token_ids
),
dtype
=
torch
.
long
,
device
=
device
)
batch_segment_ids
=
torch
.
tensor
(
sequence_padding
(
batch_segment_ids
),
dtype
=
torch
.
long
,
device
=
device
)
return
[
batch_token_ids
,
batch_segment_ids
],
[
batch_token_ids
,
batch_segment_ids
]
train_dataloader
=
DataLoader
(
MyDataset
(
'F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'
),
batch_size
=
batch_size
,
shuffle
=
True
,
collate_fn
=
collate_fn
)
valid_dataset
=
MyDataset
(
'F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json'
)
valid_dataloader
=
DataLoader
(
valid_dataset
,
batch_size
=
batch_size
,
collate_fn
=
collate_fn
)
model
=
build_transformer_model
(
config_path
,
checkpoint_path
,
with_mlm
=
True
,
application
=
'unilm'
,
keep_tokens
=
keep_tokens
,
# 只保留keep_tokens中的字,精简原字表
).
to
(
device
)
summary
(
model
,
input_data
=
[
next
(
iter
(
train_dataloader
))[
0
]])
class
CrossEntropyLoss
(
nn
.
CrossEntropyLoss
):
def
__init__
(
self
,
**
kwargs
):
super
().
__init__
(
**
kwargs
)
def
forward
(
self
,
outputs
,
target
):
'''
y_pred: [btz, seq_len, hdsz]
targets: y_true, y_segment
'''
_
,
y_pred
=
outputs
y_true
,
y_mask
=
target
y_true
=
y_true
[:,
1
:]
# 目标token_ids
y_mask
=
y_mask
[:,
1
:]
# segment_ids,刚好指示了要预测的部分
y_pred
=
y_pred
[:,
:
-
1
,
:]
# 预测序列,错开一位
y_pred
=
y_pred
.
reshape
(
-
1
,
y_pred
.
shape
[
-
1
])
y_true
=
(
y_true
*
y_mask
).
flatten
()
return
super
().
forward
(
y_pred
,
y_true
)
model
.
compile
(
loss
=
CrossEntropyLoss
(
ignore_index
=
0
),
optimizer
=
optim
.
Adam
(
model
.
parameters
(),
1e-5
))
class
ReadingComprehension
(
AutoRegressiveDecoder
):
"""beam search解码来生成答案
passages为多篇章组成的list,从多篇文章中自动决策出最优的答案,
如果没答案,则返回空字符串。
mode是extractive时,按照抽取式执行,即答案必须是原篇章的一个片段。
"""
def
__init__
(
self
,
mode
=
'extractive'
,
**
kwargs
):
super
(
ReadingComprehension
,
self
).
__init__
(
**
kwargs
)
self
.
mode
=
mode
def
get_ngram_set
(
self
,
x
,
n
):
"""生成ngram合集,返回结果格式是:
{(n-1)-gram: set([n-gram的第n个字集合])}
"""
result
=
{}
for
i
in
range
(
len
(
x
)
-
n
+
1
):
k
=
tuple
(
x
[
i
:
i
+
n
])
if
k
[:
-
1
]
not
in
result
:
result
[
k
[:
-
1
]]
=
set
()
result
[
k
[:
-
1
]].
add
(
k
[
-
1
])
return
result
    @AutoRegressiveDecoder.wraps(default_rtype='probas', use_states=True)
    def predict(self, inputs, output_ids, states):
        inputs = [i for i in inputs if i[0, 0].item() > -1]  # filter out passages without an answer
        topk = len(inputs[0])
        all_token_ids, all_segment_ids = [], []
        for token_ids in inputs:  # each element of inputs represents one passage
            token_ids = torch.cat([token_ids, output_ids], 1)
            segment_ids = torch.zeros_like(token_ids)
            if states > 0:
                segment_ids[:, -output_ids.shape[1]:] = 1
            all_token_ids.extend(token_ids)
            all_segment_ids.extend(segment_ids)
        padded_all_token_ids = sequence_padding(all_token_ids)
        padded_all_segment_ids = sequence_padding(all_segment_ids)
        _, logits = model.predict([padded_all_token_ids, padded_all_segment_ids])
        probas = nn.Softmax(dim=-1)(logits)
        # switched to torch.gather here
        # probas = [probas[i, len(ids) - 1] for i, ids in enumerate(all_token_ids)]
        # probas = torch.stack(probas).reshape((len(inputs), topk, -1))
        index_ = torch.tensor([[len(i) - 1] for i in all_token_ids], device=probas.device).view(-1, 1, 1).expand(-1, 1, probas.shape[-1])
        probas = torch.gather(probas, dim=1, index=index_).reshape((len(inputs), topk, -1))
        if states == 0:
            # this step mainly rules out passages that contain no answer:
            # if the very first argmax is already end_id, the passage has no answer
            argmax = probas[:, 0].argmax(dim=1)
            available_idxs = torch.where(argmax != self.end_id)[0]
            if len(available_idxs) == 0:
                scores = torch.zeros_like(probas[0])
                scores[:, self.end_id] = 1
                return scores, states + 1
            else:
                for i in torch.where(argmax == self.end_id)[0]:
                    inputs[i][:, 0] = -1  # mark the first position of answer-less passages as -1
                probas = probas[available_idxs]
                inputs = [i for i in inputs if i[0, 0] > -1]  # filter out passages without an answer
        if self.mode == 'extractive':
            # in extractive mode the answer must be a span of one of the passages,
            # so zero out the probabilities of all tokens that do not continue a passage span
            new_probas = torch.zeros_like(probas)
            ngrams = {}
            for token_ids in inputs:
                token_ids = token_ids[0]
                sep_idx = torch.where(token_ids == tokenizer._token_end_id)[0][0]
                p_token_ids = token_ids[1:sep_idx]
                for k, v in self.get_ngram_set(p_token_ids.cpu().numpy(), states + 1).items():
                    # convert to .cpu().numpy() here, otherwise the ngrams.get lookup below misses the keys
                    ngrams[k] = ngrams.get(k, set()) | v
            for i, ids in enumerate(output_ids):
                available_idxs = ngrams.get(tuple(ids.cpu().numpy()), set())
                available_idxs.add(tokenizer._token_end_id)
                available_idxs = list(available_idxs)
                new_probas[:, i, available_idxs] = probas[:, i, available_idxs]
            probas = new_probas
        return (probas ** 2).sum(0) / (probas.sum(0) + 1), states + 1  # a kind of averaged voting across passages
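
    # Added note (not in the original file): the score (probas ** 2).sum(0) / (probas.sum(0) + 1)
    # is a smoothed weighted average over passages; tokens that score consistently high across
    # passages keep a high score, while an isolated spike from a single passage is damped by the +1 term.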
    def answer(self, question, passages, topk=1):
        token_ids = []
        for passage in passages:
            passage = re.sub(u' |、|;|,', ',', passage)
            p_token_ids = tokenizer.encode(passage, maxlen=max_p_len)[0]
            q_token_ids = tokenizer.encode(question, maxlen=max_q_len + 1)[0]
            token_ids.append(p_token_ids + q_token_ids[1:])
        output_ids = self.beam_search(token_ids, topk=topk, states=0)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy())

reader = ReadingComprehension(start_id=None, end_id=tokenizer._token_end_id, maxlen=max_a_len, mode='extractive', device=device)
def predict_to_file(data, filename, topk=1):
    """Write the predictions to a file for easier evaluation.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
            q_text = d['question']
            p_texts = [p['passage'] for p in d['passages']]
            a = reader.answer(q_text, p_texts, topk)
            if a:
                s = u'%s\t%s\n' % (d['id'], a)
            else:
                s = u'%s\t\n' % (d['id'])
            f.write(s)
            f.flush()

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        predict_to_file(valid_dataset.data[:100], 'qa.csv')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=100, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
    predict_to_file(valid_dataset.data, 'qa.csv')
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_ape210k_math_word_problem.py
0 → 100644
#! -*- coding: utf-8 -*-
# Seq2Seq for primary-school math word problems
# dataset: ape210k, https://github.com/Chenny0808/ape210k
# write-up: https://kexue.fm/archives/7809
from __future__ import division
import json, re
from tqdm import tqdm
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from torch import nn, optim
import torch
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.snippets import AutoRegressiveDecoder
from sympy import Integer
import warnings
warnings.filterwarnings("ignore")

# basic hyperparameters
maxlen = 192
batch_size = 16
epochs = 100

# bert config
config_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def is_equal(a, b):
    """Check whether two results are equal.
    """
    a = round(float(a), 6)
    b = round(float(b), 6)
    return a == b

def remove_bucket(equation):
    """Remove redundant brackets.
    """
    l_buckets, buckets = [], []
    for i, c in enumerate(equation):
        if c == '(':
            l_buckets.append(i)
        elif c == ')':
            buckets.append((l_buckets.pop(), i))
    eval_equation = eval(equation)
    for l, r in buckets:
        new_equation = '%s %s %s' % (equation[:l], equation[l + 1:r], equation[r + 1:])
        try:
            if is_equal(eval(new_equation.replace(' ', '')), eval_equation):
                equation = new_equation
        except:
            pass
    return equation.replace(' ', '')
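
# Added example (not in the original file):
#   remove_bucket('(1+2+3)') -> '1+2+3'    (the outer bracket is redundant, the value is unchanged)
#   remove_bucket('(1+2)*3') -> '(1+2)*3'  (removing the bracket would change the value, so it is kept)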
# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the training data and normalise it so that every equation can be eval-ed.
        Reference: https://kexue.fm/archives/7809
        """
        D = []
        for l in open(filename, 'r', encoding='utf-8'):
            l = json.loads(l)
            question, equation, answer = l['original_text'], l['equation'], l['ans']
            # handle mixed numbers
            question = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', question)
            equation = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', equation)
            answer = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', answer)
            equation = re.sub('(\d+)\(', '\\1+(', equation)
            answer = re.sub('(\d+)\(', '\\1+(', answer)
            # strip the brackets around plain fractions
            question = re.sub('\((\d+/\d+)\)', '\\1', question)
            # handle percentages
            equation = re.sub('([\.\d]+)%', '(\\1/100)', equation)
            answer = re.sub('([\.\d]+)%', '(\\1/100)', answer)
            # turn ratio colons into division and handle remaining percent signs
            equation = equation.replace(':', '/').replace('%', '/100')
            answer = answer.replace(':', '/').replace('%', '/100')
            if equation[:2] == 'x=':
                equation = equation[2:]
            try:
                if is_equal(eval(equation), eval(answer)):
                    D.append((question, remove_bucket(equation), answer))
            except:
                continue
        return D
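
    # Added example (not in the original file) of the normalisation above:
    #   '3(1/2)' -> '(3+1/2)' (mixed number), '50%' -> '(50/100)', '3:4' -> '3/4',
    # so that eval(equation) can be compared against eval(answer) before a sample is accepted.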
def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    for question, equation, answer in batch:
        token_ids, segment_ids = tokenizer.encode(question, equation, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

# load the datasets
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/train.ape.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/valid.ape.json')
# valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/test.ape.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, i.e. a slimmed-down vocab
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, vocab_size]
        targets: y_true, y_segment
        UniLM style: the non-seq2seq part has to be masked out manually
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids, which happen to mark the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class AutoSolve(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, text, topk=1):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy()).replace(' ', '')

autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64, device=device)

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.best_acc = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        metrics = self.evaluate(valid_dataset.data[:200])  # evaluate the model
        if metrics['acc'] >= self.best_acc:
            self.best_acc = metrics['acc']
            # model.save_weights('./best_model_math.pt')  # save the model
        metrics['best_acc'] = self.best_acc
        print('valid_data:', metrics)
        print()

    def evaluate(self, data, topk=1):
        total, right = 0.0, 0.0
        for question, equation, answer in tqdm(data, desc='Evaluate'):
            total += 1
            pred_equation = autosolve.generate(question, topk)
            try:
                right += int(is_equal(eval(pred_equation), eval(answer)))
            except:
                pass
        return {'acc': right / total}

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=500, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.weights')
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle.py
0 → 100644
#! -*- coding: utf-8 -*-
# BERT for a Seq2Seq task with the UniLM scheme
# write-up: https://kexue.fm/archives/6933
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import glob

# basic hyperparameters
maxlen = 256
batch_size = 16
epochs = 10000

# bert config
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """Single-sample format: [CLS]content[SEP]title[SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for txt in batch:
        text = open(txt, encoding='utf-8').read()
        text = text.split('\n')
        if len(text) > 1:
            title = text[0]
            content = '\n'.join(text[1:])
            token_ids, segment_ids = tokenizer.encode(content, title, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(ListDataset(glob.glob('F:/Projects/data/corpus/sentence_classification/THUCNews/*/*.txt')),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, i.e. a slimmed-down vocab
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, vocab_size]
        targets: y_true, y_segment
        UniLM style: the non-seq2seq part has to be masked out manually
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids, which happen to mark the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, text, topk=1, topp=0.95):
        max_c_len = maxlen - self.maxlen
        token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)

def just_show():
    s1 = u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。'
    s2 = u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show a few demo generations
        just_show()

if __name__ == '__main__':
    just_show()
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=100, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_bart.py
0 → 100644
#! -*- coding: utf-8 -*-
# Seq2Seq task with the BART scheme
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import json
from rouge import Rouge

# basic hyperparameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None

# bert config
config_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data; single-sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D

def collate_fn(batch):
    """Single-sample format: content: [CLS]article[SEP], tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append([tokenizer._token_end_id] + token_ids)  # pretraining started sequences with [SEP]
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()
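
# Added note (not in the original file): collate_fn prepares teacher forcing for the encoder-decoder.
# The decoder input is the title shifted right (batch_titile_ids[:, :-1], starting with [SEP]) and the
# target is the title shifted left (batch_titile_ids[:, 1:]), so at every step the decoder predicts the
# next title token; padded positions are later skipped by ignore_index=0 in the loss.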
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')

model = build_transformer_model(config_path, checkpoint_path, model='bart', keep_tokens=keep_tokens, segment_vocab_size=0).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1.5e-5))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=tokenizer._token_end_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)

def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the dev set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)
        print('test_data:', metrics_test)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '), smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}
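
    # Added note (not in the original file): ' '.join(title) inserts a space between every character,
    # so the ROUGE and BLEU scores above are computed at the Chinese character level.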
if __name__ == '__main__':
    evaluator = Evaluator()
    just_show()
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_mt5.py
0 → 100644
#! -*- coding: utf-8 -*-
# fine-tune the multilingual T5 (mT5) for a Seq2Seq task
# write-up: https://kexue.fm/archives/7867
# dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# adds the evaluation metrics bleu, rouge-1, rouge-2, rouge-l
# mt5 specifics: gated-gelu, an independent weight for the decoder's final dense layer, rmsnorm
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import SpTokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# basic hyperparameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
token_pad_ids = -100

# bert config
config_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_torch_base]/pytorch_model.bin'
# the two files below are taken from bert4keras, project link: https://github.com/bojone/t5_in_bert4keras
spm_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_bert4keras]/sentencepiece_cn.model'
keep_tokens_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_bert4keras]/sentencepiece_cn_keep_tokens.json'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data; single-sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D

tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>')
keep_tokens = json.load(open(keep_tokens_path))

def collate_fn(batch):
    """Single-sample format: content: [CLS]article[SEP], tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append([0] + token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids, value=token_pad_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids, value=token_pad_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='mt5.1.1',
    segment_vocab_size=0,
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, i.e. a slimmed-down vocab
    token_pad_ids=token_pad_ids,  # this can also be achieved by specifying custom_attention_mask and passing attention_mask
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=token_pad_ids), optimizer=optim.Adam(model.parameters(), 1e-4))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        # inputs contains [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
        return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])

autotitle = AutoTitle(start_id=0, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the dev set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)
        print('test_data:', metrics_test)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '), smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}

def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

if __name__ == '__main__':
    evaluator = Evaluator()
    print(u'生成标题:', autotitle.generate(u'中国的首都是extra0京'))  # consistent with the huggingface result
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_t5_pegasus.py
0 → 100644
#! -*- coding: utf-8 -*-
# fine-tune T5 PEGASUS for a Seq2Seq task, using a BertTokenizer
# write-up: https://kexue.fm/archives/8209
# weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_t5_pegasus.py
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import jieba
jieba.initialize()

# basic hyperparameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None

# bert config
pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/'
config_path = pretrain_model + 'config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
dict_path = pretrain_model + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data; single-sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D

tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.cut(s, HMM=False))
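
# Added note (not in the original file): pre_tokenize runs jieba word segmentation before the
# WordPiece step, so whole words that exist in the T5 PEGASUS vocab are kept as single tokens
# instead of being split into characters; HMM=False keeps the segmentation deterministic.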
def collate_fn(batch):
    """Single-sample format: content: [CLS]article[SEP], tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append(token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='mt5.1.1',
    segment_vocab_size=0,
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        # inputs contains [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
        return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])

autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the dev set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)
        print('test_data:', metrics_test)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '), smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}

def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

if __name__ == '__main__':
    evaluator = Evaluator()
    print(u'生成标题:', autotitle.generate(u'今天天气不错啊'))
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_uer_t5.py
0 → 100644
#! -*- coding: utf-8 -*-
# fine-tune the UER version of T5 for a Seq2Seq task
# dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# basic hyperparameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None

# bert config
config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data; single-sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """Single-sample format: content: [CLS]article[SEP], tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append(token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='t5.1.0',
    segment_vocab_size=0,
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, i.e. a slimmed-down vocab
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the dev set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)
        print('test_data:', metrics_test)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '), smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}

def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

if __name__ == '__main__':
    evaluator = Evaluator()
    print(u'生成标题:', autotitle.generate('中国的首都是extra0京'))
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_autotitle_csl_unilm.py
0 → 100644
#! -*- coding: utf-8 -*-
# BERT for a Seq2Seq task with the UniLM scheme
# write-up: https://kexue.fm/archives/6933
# dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# adds the evaluation metrics bleu, rouge-1, rouge-2, rouge-l
import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# basic hyperparameters
maxlen = 256
batch_size = 16
epochs = 50
steps_per_epoch = None

# bert config
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data; single-sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D

# load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """Single-sample format: [CLS]article[SEP]title[SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for title, content in batch:
        token_ids, segment_ids = tokenizer.encode(content, title, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')
test_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_test.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, i.e. a slimmed-down vocab
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, hdsz]
        targets: y_true, y_segment
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids, which happen to mark the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, text, topk=1, topp=0.95):
        max_c_len = maxlen - self.maxlen
        token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)

class Evaluator(Callback):
    """Evaluation and checkpointing.
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data)  # evaluate on the dev set
        metrics_test = self.evaluate(test_dataset.data)  # evaluate on the test set
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)
        print('test_data:', metrics_test)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '), smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}

def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

if __name__ == '__main__':
    just_show()
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')