chenzk / bert4torch_pytorch · Commits
Commit 66a1d0d0, authored Aug 22, 2023 by yangzhong
Commit message: Submit the initial version of the bert4torch project
Changes: 160 files in this commit; showing 20 changed files with 3972 additions and 0 deletions (+3972, -0)
examples/sentence_embedding/task_sentence_embedding_sup_CoSENT.py (+145, -0)
examples/sentence_embedding/task_sentence_embedding_sup_ContrastiveLoss.py (+139, -0)
examples/sentence_embedding/task_sentence_embedding_sup_CosineMSELoss.py (+137, -0)
examples/sentence_embedding/task_sentence_embedding_sup_InfoNCE.py (+204, -0)
examples/sentence_embedding/task_sentence_embedding_sup_concat_CrossEntropyLoss.py (+167, -0)
examples/sentence_embedding/task_sentence_embedding_unsup_CT.py (+212, -0)
examples/sentence_embedding/task_sentence_embedding_unsup_CT_In-Batch_Negatives.py (+207, -0)
examples/sentence_embedding/task_sentence_embedding_unsup_ESimCSE.py (+269, -0)
examples/sentence_embedding/task_sentence_embedding_unsup_PromptBert.py (+213, -0)
examples/sentence_embedding/task_sentence_embedding_unsup_SimCSE.py (+189, -0)
examples/sentence_embedding/task_sentence_embedding_unsup_TSDAE.py (+221, -0)
examples/sentence_embedding/task_sentence_embedding_unsup_bert_whitening.py (+158, -0)
examples/seq2seq/task_kgclue_seq2seq.py (+356, -0)
examples/seq2seq/task_question_answer_generation_by_seq2seq.py (+192, -0)
examples/seq2seq/task_reading_comprehension_by_mlm.py (+237, -0)
examples/seq2seq/task_reading_comprehension_by_seq2seq.py (+268, -0)
examples/seq2seq/task_seq2seq_ape210k_math_word_problem.py (+201, -0)
examples/seq2seq/task_seq2seq_autotitle.py (+139, -0)
examples/seq2seq/task_seq2seq_autotitle_csl_bart.py (+152, -0)
examples/seq2seq/task_seq2seq_autotitle_csl_mt5.py (+166, -0)
examples/sentence_embedding/task_sentence_embedding_sup_CoSENT.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Original project: https://kexue.fm/archives/8847
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # command-line arguments
pooling, task_name = 'cls', 'ATEC'  # for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Single sample format: (text1, text2, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], int(l[2])))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text1, text2, label in batch:
        for text in [text1, text2]:
            token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)

    def forward(self, token_ids):
        hidden_state, pooler = self.bert([token_ids])
        sem_emb = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return sem_emb

model = Model().to(device)

class MyLoss(nn.Module):
    def forward(self, y_pred, y_true):
        # 1. Take the true labels (one per sentence pair)
        y_true = y_true[::2]  # e.g. tensor([1, 0, 1])
        # 2. L2-normalize the sentence vectors so that an elementwise product gives the cosine
        norms = (y_pred ** 2).sum(axis=1, keepdims=True) ** 0.5
        # y_pred = y_pred / torch.clip(norms, 1e-8, torch.inf)
        y_pred = y_pred / norms
        # 3. Multiply odd and even rows to get the scaled cosine of each pair
        y_pred = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * 20
        # 4. Differences "negative-pair cosine minus positive-pair cosine"
        y_pred = y_pred[:, None] - y_pred[None, :]  # pairwise differences of all cosines
        # entry (i, j) of the matrix is cosine_i - cosine_j
        y_true = y_true[:, None] < y_true[None, :]  # keep only the negative-minus-positive entries
        y_true = y_true.float()
        y_pred = y_pred - (1 - y_true) * 1e12
        y_pred = y_pred.view(-1)
        # prepend 0 because e^0 = 1, which adds the "+1" inside the log
        y_pred = torch.cat((torch.tensor([0.0], device=device), y_pred), dim=0)
        return torch.logsumexp(y_pred, dim=0)

# Loss and optimizer (customizable)
model.compile(
    loss=MyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = self.evaluate(valid_dataloader)
        test_consine = self.evaluate(test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for batch_token_ids, batch_labels in tqdm(data, desc='Evaluate'):
            embeddings = model.predict(batch_token_ids)
            embeddings1.append(embeddings[::2])
            embeddings2.append(embeddings[1::2])
            labels.append(batch_labels[::2])
        embeddings1 = torch.cat(embeddings1).cpu().numpy()
        embeddings2 = torch.cat(embeddings2).cpu().numpy()
        labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
        return eval_pearson_cosine

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
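To make the ranking behaviour of the CoSENT objective above visible, here is a minimal standalone sketch (toy vectors and labels are invented for illustration, not part of the commit) that reimplements the same MyLoss logic as a function and runs it on two hand-made sentence pairs:

# Standalone CoSENT sketch: pairs whose label says "more similar" must get a larger cosine.
import torch

def cosent_loss(y_pred, y_true, scale=20):
    y_true = y_true[::2]                                           # one label per sentence pair
    y_pred = torch.nn.functional.normalize(y_pred, p=2, dim=1)
    cos = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * scale     # scaled cosine of each pair
    diff = cos[:, None] - cos[None, :]                             # cos_i - cos_j for every (i, j)
    mask = (y_true[:, None] < y_true[None, :]).float()             # keep only label_i < label_j
    diff = diff - (1 - mask) * 1e12
    diff = torch.cat((torch.tensor([0.0]), diff.view(-1)), dim=0)  # the 0 supplies the "+1" in the log
    return torch.logsumexp(diff, dim=0)

emb = torch.tensor([[1.0, 0.0], [1.0, 0.1],     # pair 1: nearly identical vectors
                    [1.0, 0.0], [0.0, 1.0]])    # pair 2: orthogonal vectors
labels = torch.tensor([1.0, 1.0, 0.0, 0.0])
print(cosent_loss(emb, labels))      # small loss: the similar pair already has the larger cosine
print(cosent_loss(emb, 1 - labels))  # large loss: the required ranking is violated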
examples/sentence_embedding/task_sentence_embedding_sup_ContrastiveLoss.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# loss: ContrastiveLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
from bert4torch.losses import ContrastiveLoss
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # command-line arguments
pooling, task_name = 'cls', 'ATEC'  # for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Single sample format: (text1, text2, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], int(l[2])))
        return D

def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([int(label > 2.5) if task_name == 'STS-B' else label])
    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)

    def forward(self, token1_ids, token2_ids):
        hidden_state1, pool_cls1 = self.bert([token1_ids])
        pool_emb1 = get_pool_emb(hidden_state1, pool_cls1, token1_ids.gt(0).long(), self.pool_method)
        hidden_state2, pool_cls2 = self.bert([token2_ids])
        pool_emb2 = get_pool_emb(hidden_state2, pool_cls2, token2_ids.gt(0).long(), self.pool_method)
        distance = 1 - torch.cosine_similarity(pool_emb1, pool_emb2)
        return distance

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

model = Model().to(device)

# Loss and optimizer (customizable)
model.compile(
    loss=ContrastiveLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = self.evaluate(valid_dataloader)
        test_consine = self.evaluate(test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
            embeddings1.append(model.predict(batch_token1_ids).cpu())
            embeddings2.append(model.predict(batch_token2_ids).cpu())
            labels.append(batch_labels)
        embeddings1 = torch.cat(embeddings1).numpy()
        embeddings2 = torch.cat(embeddings2).numpy()
        labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
        return eval_pearson_cosine

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
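The model above returns a cosine distance per pair and hands it to bert4torch's ContrastiveLoss together with 0/1 labels. As a purely illustrative sketch (the margin value and reduction are assumptions; the library's exact defaults may differ), the classic margin-based contrastive loss of Hadsell et al. applied to such distances looks like this:

# Illustrative margin-based contrastive loss over the cosine distances produced above.
import torch

def contrastive_loss(distance, label, margin=0.5):
    # label 1 = similar pair  -> pull its distance towards 0
    # label 0 = dissimilar pair -> push its distance beyond the margin
    pos = label * distance.pow(2)
    neg = (1 - label) * torch.clamp(margin - distance, min=0).pow(2)
    return 0.5 * (pos + neg).mean()

distance = torch.tensor([0.1, 0.9, 0.4])   # 1 - cosine_similarity for three pairs
label = torch.tensor([1.0, 0.0, 1.0])
print(contrastive_loss(distance, label))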
examples/sentence_embedding/task_sentence_embedding_sup_CosineMSELoss.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# loss: CosineMSELoss (cosine similarity + mse_loss)
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # command-line arguments
pooling, task_name = 'cls', 'ATEC'  # for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Single sample format: (text1, text2, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) != 3:
                    continue
                text1, text2, label = l
                label = int(label) / 5 if task_name == 'STS-B' else int(label)
                D.append((text1, text2, label))
        return D

def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([label])
    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)

    def forward(self, token1_ids, token2_ids):
        hidden_state1, pooler1 = self.bert([token1_ids])
        pool_emb1 = get_pool_emb(hidden_state1, pooler1, token1_ids.gt(0).long(), self.pool_method)
        hidden_state2, pooler2 = self.bert([token2_ids])
        pool_emb2 = get_pool_emb(hidden_state2, pooler2, token2_ids.gt(0).long(), self.pool_method)
        return torch.cosine_similarity(pool_emb1, pool_emb2)

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

model = Model().to(device)

# Loss and optimizer (customizable)
model.compile(loss=nn.MSELoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

# Evaluation function
def evaluate(model, data):
    embeddings1, embeddings2, labels = [], [], []
    for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
        embeddings1.append(model.encode(batch_token1_ids))
        embeddings2.append(model.encode(batch_token2_ids))
        labels.append(batch_labels)
    embeddings1 = torch.cat(embeddings1).cpu().numpy()
    embeddings2 = torch.cat(embeddings2).cpu().numpy()
    labels = torch.cat(labels).cpu().numpy()
    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(model, valid_dataloader)
        test_consine = evaluate(model, test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
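A small sketch of what this objective reduces to (toy numbers only, invented for illustration): the model's cosine similarity is regressed directly onto the label, which for STS-B is first rescaled from the 0-5 annotation range into 0-1 in MyDataset.load_data above.

# Toy regression target for the CosineMSELoss setup.
import torch
import torch.nn as nn

raw_label = 4                    # STS-B style annotation in [0, 5]
target = raw_label / 5           # -> 0.8, the same scaling as load_data
pred_cos = torch.tensor([0.65])  # cosine similarity returned by Model.forward
print(nn.MSELoss()(pred_cos, torch.tensor([target])))  # (0.65 - 0.8)^2 = 0.0225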
examples/sentence_embedding/task_sentence_embedding_sup_InfoNCE.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# loss: InfoNCE (i.e. the MultiNegativeRankingLoss of sentence_transformers)
# The samples are positive/negative pairs. When building (anchor, positive, negative) triples,
# a positive pair (a, a1) gets a randomly sampled negative to form (a, a1, neg);
# for a negative pair (a, neg) the corresponding positive pair is reused to form (a, a1, neg).
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
import random
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # command-line arguments
pooling, task_name = 'cls', 'ATEC'  # for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# =========================== Data preprocessing ===========================
# Training
def collate_fn(batch):
    texts_list = [[] for _ in range(3)]
    for texts in batch:
        for i, text in enumerate(texts):
            token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
            texts_list[i].append(token_ids)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
    return texts_list, labels

# Load the dataset
def get_data(filename):
    train_data, all_texts = {}, []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = l.strip().split('\t')
            if len(l) != 3:
                continue
            text1, text2, label = l
            label = str(int(int(label) > 2.5)) if task_name == 'STS-B' else label
            if text1 not in train_data:
                train_data[text1] = {'0': set(), '1': set()}
            train_data[text1][label].add(text2)
            if text2 not in train_data:
                train_data[text2] = {'0': set(), '1': set()}
            train_data[text2][label].add(text1)
            all_texts.extend([text1, text2])

    train_samples = []
    for sent1, others in train_data.items():
        if len(others['1']) == 0:
            others['1'] = [sent1]  # no positive: use the sentence itself (effectively unsupervised)
        elif len(others['0']) == 0:
            others['0'] = [random.choice(all_texts)]  # no negative: pick one at random
        # sentence-bert adds both of the two samples below; the problem is that with shuffle=False
        # they sit in the same batch and a similar sentence may end up acting as a negative
        if random.random() < 0.5:
            train_samples.append((sent1, random.choice(list(others['1'])), random.choice(list(others['0']))))
        else:
            train_samples.append((random.choice(list(others['1'])), sent1, random.choice(list(others['0']))))
    return train_samples

train_data = get_data(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data')
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Single sample format: (text1, text2, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], int(l[2])))
        return D

def collate_fn_eval(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([label])
    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# Load the evaluation datasets
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn_eval)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn_eval)

# Build the model
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)
        self.scale = scale

    def forward(self, token_ids_list):
        reps = []
        for token_ids in token_ids_list:
            hidden_state1, pooler = self.bert([token_ids])
            rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)
        embeddings_a = reps[0]
        embeddings_b = torch.cat(reps[1:])
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return output

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = Model().to(device)

# Loss and optimizer (customizable)
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = self.evaluate(valid_dataloader)
        test_consine = self.evaluate(test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

        # Rebuild the dataloader so that the random sample selection is redone each epoch
        train_data = get_data(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data')
        model.train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # Evaluation function
    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for (batch_token_ids1, batch_token_ids2), batch_labels in tqdm(data, desc='Evaluate'):
            embeddings1.append(model.predict(batch_token_ids1))
            embeddings2.append(model.predict(batch_token_ids2))
            labels.append(batch_labels)
        embeddings1 = torch.cat(embeddings1).cpu().numpy()
        embeddings2 = torch.cat(embeddings2).cpu().numpy()
        labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))  # cosine similarity is 1 - paired distance
        eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
        return eval_pearson_cosine

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
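The InfoNCE setup above is exactly CrossEntropyLoss over a scaled cosine-similarity matrix: each anchor must pick out its own positive among all positives and the extra hard negatives, with labels = arange(batch_size). A minimal sketch with random toy embeddings (invented for illustration, not part of the commit):

# Toy in-batch-negatives / InfoNCE computation.
import torch
import torch.nn as nn
import torch.nn.functional as F

batch, dim, scale = 4, 8, 20.0
anchors = F.normalize(torch.randn(batch, dim), dim=1)
positives = F.normalize(anchors + 0.05 * torch.randn(batch, dim), dim=1)  # close to their anchors
negatives = F.normalize(torch.randn(batch, dim), dim=1)                   # extra hard negatives

candidates = torch.cat([positives, negatives])   # [2*batch, dim], like torch.cat(reps[1:]) above
scores = anchors @ candidates.T * scale          # [batch, 2*batch] scaled cosine matrix
labels = torch.arange(batch)                     # anchor i's positive sits in column i
print(nn.CrossEntropyLoss()(scores, labels))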
examples/sentence_embedding/task_sentence_embedding_sup_concat_CrossEntropyLoss.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# loss: concatenate the sentence vectors as (u, v, u-v, u*v) and train with CrossEntropyLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # command-line arguments
pooling, task_name = 'cls', 'ATEC'  # for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Single sample format: (text1, text2, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], int(l[2])))
        return D

def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        label = int(label > 2.5) if task_name == 'STS-B' else label
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([label])
    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls', concatenation_sent_rep=True, concatenation_sent_difference=True,
                 concatenation_sent_multiplication=False):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)
        self.concatenation_sent_rep = concatenation_sent_rep
        self.concatenation_sent_difference = concatenation_sent_difference
        self.concatenation_sent_multiplication = concatenation_sent_multiplication

        hidden_unit = 0
        hidden_unit += 768 * 2 if self.concatenation_sent_rep else 0
        hidden_unit += 768 if self.concatenation_sent_difference else 0
        hidden_unit += 768 if self.concatenation_sent_multiplication else 0
        self.fc = nn.Linear(hidden_unit, 2)

    def forward(self, token1_ids, token2_ids):
        hidden_state1, pooler1 = self.bert([token1_ids])
        rep_a = get_pool_emb(hidden_state1, pooler1, token1_ids.gt(0).long(), self.pool_method)
        hidden_state2, pooler2 = self.bert([token2_ids])
        rep_b = get_pool_emb(hidden_state2, pooler2, token2_ids.gt(0).long(), self.pool_method)

        vectors_concat = []
        if self.concatenation_sent_rep:
            vectors_concat.append(rep_a)
            vectors_concat.append(rep_b)
        if self.concatenation_sent_difference:
            vectors_concat.append(torch.abs(rep_a - rep_b))
        if self.concatenation_sent_multiplication:
            vectors_concat.append(rep_a * rep_b)
        vectors_concat = torch.cat(vectors_concat, dim=1)
        return self.fc(vectors_concat)

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

model = Model().to(device)

# Loss and optimizer (customizable)
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = self.evaluate(valid_dataloader)
        test_consine = self.evaluate(test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
            embeddings1.append(model.predict(batch_token1_ids).cpu())
            embeddings2.append(model.predict(batch_token2_ids).cpu())
            labels.append(batch_labels)
        embeddings1 = torch.cat(embeddings1).numpy()
        embeddings2 = torch.cat(embeddings2).numpy()
        labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
        return eval_pearson_cosine

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
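With the default flags above (concatenation_sent_rep and concatenation_sent_difference enabled, multiplication disabled), the classifier sees the sentence-BERT style feature vector (u, v, |u - v|), i.e. 768*2 + 768 = 2304 inputs. A toy sketch with random vectors (illustrative only, not the commit's code):

# Toy version of the classification head built in Model.__init__/forward above.
import torch
import torch.nn as nn

hidden = 768
u = torch.randn(2, hidden)   # pooled embedding of sentence 1 for a batch of 2 pairs
v = torch.randn(2, hidden)   # pooled embedding of sentence 2
features = torch.cat([u, v, torch.abs(u - v)], dim=1)   # [2, 2304]
fc = nn.Linear(hidden * 3, 2)                            # similar / dissimilar logits
print(fc(features).shape)                                # torch.Size([2, 2])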
examples/sentence_embedding/task_sentence_embedding_unsup_CT.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Semantic similarity task - unsupervised
# ContrastiveTensionLoss: the same sentence is fed through two models; the dot product of the pooled outputs should be large
# | solution | ATEC  | BQ    | LCQMC | PAWSX | STS-B |
# | CT       | 30.65 | 44.50 | 68.67 | 16.20 | 69.27 |
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import pearsonr, spearmanr
import copy
import random
from tqdm import tqdm
import numpy as np
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)

# For NEZHA and RoFormer the `model` argument of build_transformer_model has to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}

if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load the datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load the data (with labels).
    Single sample format: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# Training dataloader
def collate_fn(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    pos_id = random.randint(0, len(batch) - 1)
    pos_token_ids, _ = tokenizer.encode(batch[pos_id], maxlen=maxlen)
    texts_list[0].append(pos_token_ids)
    texts_list[1].append(pos_token_ids)
    labels.append(1)
    for neg_id in range(len(batch)):
        if neg_id == pos_id:
            continue
        elif random.random() < 0.5:
            neg_token_ids, _ = tokenizer.encode(batch[neg_id], maxlen=maxlen)
            texts_list[0].append(pos_token_ids)
            texts_list[1].append(neg_token_ids)
            labels.append(0)
        else:
            neg_token_ids, _ = tokenizer.encode(batch[neg_id], maxlen=maxlen)
            texts_list[0].append(neg_token_ids)
            texts_list[1].append(pos_token_ids)
            labels.append(0)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels

train_dataloader = DataLoader(ListDataset(data=train_texts), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels

valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.model1 = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                              dropout_rate=dropout_rate, with_pool=with_pool,
                                              output_all_encoded_layers=output_all_encoded_layers)
        self.model2 = copy.deepcopy(self.model1)
        self.pool_method = pool_method

    def forward(self, token_ids_list):
        token_ids1 = token_ids_list[0]
        hidden_state1, pool_cls1 = self.model1([token_ids1])
        embeddings_a = get_pool_emb(hidden_state1, pool_cls1, token_ids1.gt(0).long(), self.pool_method)

        token_ids2 = token_ids_list[1]
        hidden_state2, pool_cls2 = self.model2([token_ids2])
        embeddings_b = get_pool_emb(hidden_state2, pool_cls2, token_ids2.gt(0).long(), self.pool_method)

        return torch.matmul(embeddings_a[:, None], embeddings_b[:, :, None]).squeeze(-1).squeeze(-1)  # [btz]

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.model1([token_ids])
            output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return output

model = Model(pool_method=pooling).to(device)

# Loss and optimizer (customizable)
model.compile(
    loss=nn.BCEWithLogitsLoss(reduction='mean'),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),  # use a sufficiently small learning rate
)

# Evaluation function
def evaluate(data):
    cosine_scores, labels = [], []
    for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
        embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
        embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
        cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        cosine_scores.append(cosine_score)
        labels.append(label)
    cosine_scores = np.concatenate(cosine_scores)
    labels = torch.cat(labels).cpu().numpy()
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
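At its core the Contrastive Tension objective above is a per-pair binary classification: two independent encoders embed the batch, the unnormalized dot product between the two views of the same sentence is pushed up, and cross-sentence dot products are pushed down with BCEWithLogitsLoss. A toy sketch with random vectors (illustrative only, not the commit's code):

# Toy Contrastive Tension step on pre-computed pooled embeddings.
import torch
import torch.nn as nn

emb_a = torch.randn(3, 8)               # pooled output of model1 for 3 sentence slots
emb_b = torch.randn(3, 8)               # pooled output of model2, row-aligned with emb_a
logits = (emb_a * emb_b).sum(dim=1)     # row-wise dot product, same quantity as the matmul above
labels = torch.tensor([1.0, 0.0, 0.0])  # row 0 holds the identical-sentence pair, the rest are negatives
print(nn.BCEWithLogitsLoss(reduction='mean')(logits, labels))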
examples/sentence_embedding/task_sentence_embedding_unsup_CT_In-Batch_Negatives.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Semantic similarity task - unsupervised
# loss: contrastive learning loss (similar to SimCSE), only with two separate models
# | solution        | ATEC  | BQ    | LCQMC | PAWSX | STS-B |
# | CT_In_Batch_Neg | 32.47 | 47.09 | 68.56 | 27.50 | 74.00 |
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import copy
import numpy as np
from tqdm import tqdm
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)

# For NEZHA and RoFormer the `model` argument of build_transformer_model has to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}

if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load the datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read the data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load the data (with labels).
    Single sample format: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# Training dataloader
def collate_fn(batch):
    texts_list = [[] for _ in range(2)]
    for text in batch:
        token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
        texts_list[0].append(token_ids)
        texts_list[1].append(token_ids)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
    return texts_list, labels

train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels

valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.model1 = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                              dropout_rate=dropout_rate, with_pool=with_pool,
                                              output_all_encoded_layers=output_all_encoded_layers)
        self.model2 = copy.deepcopy(self.model1)
        self.pool_method = pool_method
        self.scale = scale

    def forward(self, token_ids_list):
        token_ids = token_ids_list[0]
        hidden_state1, pooler1 = self.model1([token_ids])
        embeddings_a = get_pool_emb(hidden_state1, pooler1, token_ids.gt(0).long(), self.pool_method)

        token_ids = token_ids_list[1]
        hidden_state2, pooler2 = self.model2([token_ids])
        embeddings_b = get_pool_emb(hidden_state2, pooler2, token_ids.gt(0).long(), self.pool_method)

        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.model1([token_ids])
            output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return output

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = Model(pool_method=pooling).to(device)

# Loss and optimizer (customizable)
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

# Evaluation function
def evaluate(data):
    cosine_scores, labels = [], []
    for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
        embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
        embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
        cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        cosine_scores.append(cosine_score)
        labels.append(label)
    cosine_scores = np.concatenate(cosine_scores)
    labels = torch.cat(labels).cpu().numpy()
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
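The in-batch-negatives variant above replaces per-pair BCE with a full similarity matrix between the outputs of the two encoders: the scaled cosine matrix is fed to CrossEntropyLoss, so every other sentence in the batch acts as a negative and the matching sentence sits on the diagonal. A toy sketch with random vectors (illustrative only, not the commit's code):

# Toy CT-with-in-batch-negatives step on pre-computed pooled embeddings.
import torch
import torch.nn as nn
import torch.nn.functional as F

emb_a = F.normalize(torch.randn(4, 8), dim=1)                    # model1 pooled outputs
emb_b = F.normalize(emb_a + 0.05 * torch.randn(4, 8), dim=1)     # model2 outputs of the same sentences
scores = emb_a @ emb_b.T * 20.0                                  # matches Model.cos_sim(...) * scale above
labels = torch.arange(4)                                         # the matching sentence is on the diagonal
print(nn.CrossEntropyLoss()(scores, labels))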
examples/sentence_embedding/task_sentence_embedding_unsup_ESimCSE.py (new file, mode 100644)
#! -*- coding: utf-8 -*-
# ESimCSE, tested on Chinese tasks
# | solution | ATEC  | BQ    | LCQMC | PAWSX | STS-B |
# | ESimCSE  | 34.05 | 50.54 | 71.58 | 12.53 | 71.27 |
from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
import random
import copy
import sys
from bert4torch.snippets import ListDataset
import jieba
jieba.initialize()


class CollateFunc(object):
    '''Duplicate words inside each sentence (positive view) and draw negatives from a queue
    '''
    def __init__(self, tokenizer, max_len=256, q_size=160, dup_rate=0.15):
        self.q = []
        self.q_size = q_size
        self.max_len = max_len
        self.dup_rate = dup_rate
        self.tokenizer = tokenizer

    def word_repetition(self, batch_text, pre_tokenize=False):
        dst_text = list()
        for text in batch_text:
            if pre_tokenize:
                cut_text = jieba.cut(text, cut_all=False)
                text = list(cut_text)
            actual_len = len(text)
            dup_len = random.randint(a=0, b=max(2, int(self.dup_rate * actual_len)))
            try:
                dup_word_index = random.sample(list(range(1, actual_len)), k=dup_len)
            except:
                dup_word_index = set()

            dup_text = ''
            for index, word in enumerate(text):
                dup_text += word
                if index in dup_word_index:
                    dup_text += word
            dst_text.append(dup_text)
        return dst_text

    def negative_samples(self, batch_src_text):
        batch_size = len(batch_src_text)
        negative_samples = None
        if len(self.q) > 0:
            negative_samples = self.q[:self.q_size]
            # print("size of negative_samples", len(negative_samples))
        if len(self.q) + batch_size >= self.q_size:
            del self.q[:batch_size]
        self.q.extend(batch_src_text)
        return negative_samples

    def __call__(self, batch_text):
        '''
        input: batch_text: [batch_text,]
        output: batch_src_text, batch_dst_text, batch_neg_text
        '''
        batch_pos_text = self.word_repetition(batch_text)
        batch_neg_text = self.negative_samples(batch_text)
        # print(len(batch_pos_text))

        batch_tokens_list, batch_pos_tokens_list = [], []
        for text, text_pos in zip(batch_text, batch_pos_text):
            batch_tokens_list.append(self.tokenizer.encode(text, maxlen=maxlen)[0])
            batch_pos_tokens_list.append(self.tokenizer.encode(text_pos, maxlen=maxlen)[0])

        batch_neg_tokens_list = []
        if batch_neg_text:
            for text in batch_neg_text:
                batch_neg_tokens_list.append(self.tokenizer.encode(text, maxlen=maxlen)[0])

        batch_tokens_list = torch.tensor(sequence_padding(batch_tokens_list), dtype=torch.long, device=device)
        batch_pos_tokens_list = torch.tensor(sequence_padding(batch_pos_tokens_list), dtype=torch.long, device=device)
        labels = torch.arange(batch_tokens_list.size(0), device=batch_tokens_list.device)
        if batch_neg_tokens_list:
            batch_neg_tokens_list = torch.tensor(sequence_padding(batch_neg_tokens_list), dtype=torch.long, device=device)
            return [batch_tokens_list, batch_pos_tokens_list, batch_neg_tokens_list], labels
        else:
            return [batch_tokens_list, batch_pos_tokens_list], labels

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'STS-B', 0.3  # for debugging
print(model_type, pooling, task_name, dropout_rate)

# For NEZHA and RoFormer the `model` argument of build_transformer_model has to be changed accordingly
assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}

if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]

config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load the datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load the data (with labels).
    Single sample format: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

train_call_func = CollateFunc(tokenizer, max_len=maxlen, q_size=64, dup_rate=0.15)
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=train_call_func)

def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels

valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Build the model
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.encoder = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                               dropout_rate=dropout_rate, with_pool=with_pool,
                                               output_all_encoded_layers=output_all_encoded_layers)
        self.momentum_encoder = copy.deepcopy(self.encoder)
        self.scale = scale

    def forward(self, token_ids_list):
        reps = []
        for token_ids in token_ids_list[:2]:
            hidden_state1, pooler = self.encoder([token_ids])
            rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)
        if len(token_ids_list) == 3:  # negatives
            hidden_state1, pooler = self.momentum_encoder([token_ids_list[2]])
            rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)

        embeddings_a = reps[0]
        embeddings_b = torch.cat(reps[1:])
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.encoder([token_ids])
])
output
=
get_pool_emb
(
hidden_state
,
pooler
,
token_ids
.
gt
(
0
).
long
(),
self
.
pool_method
)
return
output
@
staticmethod
def
cos_sim
(
a
,
b
):
a_norm
=
torch
.
nn
.
functional
.
normalize
(
a
,
p
=
2
,
dim
=
1
)
b_norm
=
torch
.
nn
.
functional
.
normalize
(
b
,
p
=
2
,
dim
=
1
)
return
torch
.
mm
(
a_norm
,
b_norm
.
transpose
(
0
,
1
))
model
=
Model
(
pool_method
=
pooling
).
to
(
device
)
class
Momentum
(
object
):
''' 动量更新,这里用scheduler来实现,因为是在optimizer.step()后来调用的
'''
def
__init__
(
self
,
gamma
=
0.95
)
->
None
:
self
.
gamma
=
gamma
def
step
(
self
):
for
encoder_param
,
moco_encoder_param
in
zip
(
model
.
encoder
.
parameters
(),
model
.
momentum_encoder
.
parameters
()):
moco_encoder_param
.
data
=
self
.
gamma
*
moco_encoder_param
.
data
+
(
1.
-
self
.
gamma
)
*
encoder_param
.
data
model
.
compile
(
loss
=
nn
.
CrossEntropyLoss
(),
optimizer
=
optim
.
Adam
(
model
.
parameters
(),
1e-5
),
scheduler
=
Momentum
(
gamma
=
0.95
))
class
Evaluator
(
Callback
):
"""评估与保存
"""
def
__init__
(
self
):
self
.
best_val_consine
=
0.
def
on_epoch_end
(
self
,
global_step
,
epoch
,
logs
=
None
):
val_consine
=
evaluate
(
valid_dataloader
)
if
val_consine
>
self
.
best_val_consine
:
self
.
best_val_consine
=
val_consine
# model.save_weights('best_model.pt')
print
(
f
'val_consine:
{
val_consine
:.
5
f
}
, best_val_consine:
{
self
.
best_val_consine
:.
5
f
}
\n
'
)
def
evaluate
(
dataloader
):
# 模型预测
# 标准化,相似度,相关系数
sims_list
,
labels
=
[],
[]
for
(
a_token_ids
,
b_token_ids
),
label
in
tqdm
(
dataloader
):
a_vecs
=
model
.
encode
(
a_token_ids
)
b_vecs
=
model
.
encode
(
b_token_ids
)
a_vecs
=
torch
.
nn
.
functional
.
normalize
(
a_vecs
,
p
=
2
,
dim
=
1
).
cpu
().
numpy
()
b_vecs
=
torch
.
nn
.
functional
.
normalize
(
b_vecs
,
p
=
2
,
dim
=
1
).
cpu
().
numpy
()
sims
=
(
a_vecs
*
b_vecs
).
sum
(
axis
=
1
)
sims_list
.
append
(
sims
)
labels
.
append
(
label
.
cpu
().
numpy
())
corrcoef
=
scipy
.
stats
.
spearmanr
(
np
.
concatenate
(
labels
),
np
.
concatenate
(
sims_list
)).
correlation
return
corrcoef
if
__name__
==
'__main__'
:
evaluator
=
Evaluator
()
model
.
fit
(
train_dataloader
,
steps_per_epoch
=
None
,
epochs
=
5
,
callbacks
=
[
evaluator
])
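# Added note: forward() above returns a [btz, btz] matrix of scaled cosine similarities and the
# collate function uses torch.arange(btz) as labels, so nn.CrossEntropyLoss() is exactly the
# in-batch-negatives InfoNCE objective. The helper below is only an illustrative sketch of that
# equivalence on random stand-in tensors; it is not called anywhere in this script.
def _infonce_sketch(batch_size=4, dim=8, scale=20.0):
    a = torch.nn.functional.normalize(torch.randn(batch_size, dim), dim=1)  # anchor embeddings
    b = torch.nn.functional.normalize(torch.randn(batch_size, dim), dim=1)  # positives (off-diagonal entries act as negatives)
    scores = a @ b.T * scale                   # row i, column i is the positive pair
    labels = torch.arange(batch_size)          # each row has to pick its own column
    return torch.nn.CrossEntropyLoss()(scores, labels)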
examples/sentence_embedding/task_sentence_embedding_unsup_PromptBert.py
0 → 100644
View file @
66a1d0d0
#! -*- coding: utf-8 -*-
# Sentence embedding with PromptBERT
# Official repo: https://github.com/kongds/Prompt-BERT
# Reference repo: https://github.com/Macielyoung/sentence_representation_matching
# | solution   | ATEC  | BQ   | LCQMC | PAWSX | STS-B |
# | PromptBert | 33.98 | 49.89| 73.18 | 13.30 | 73.42 |

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import ListDataset, sequence_padding, Callback
from torch.utils.data import DataLoader
from scipy.stats import pearsonr, spearmanr
import numpy as np
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
model_type, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, task_name, dropout_rate = 'BERT', 'ATEC', 0.3  # for debugging
print(model_type, task_name, dropout_rate)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32
template_len = 15

if task_name == 'PAWSX':
    maxlen = 128 + template_len
else:
    maxlen = 64 + template_len

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False), add_special_tokens='[X]')
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, add_special_tokens='[X]')

replace_token = "[X]"
mask_token = "[MASK]"
prompt_templates = ['"{}" 的意思为[MASK]'.format(replace_token), '"{}"这句话的意思是[MASK]'.format(replace_token)]
tao = 0.05
token_dict = load_vocab(dict_path)
compound_tokens = [[len(token_dict)]]
token_dict['[X]'] = len(token_dict)

# Load the dataset
def load_data(filenames):
    D = []
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in tqdm(f.readlines(), desc='Load data'):
                cache = line.split('\t')
                text1, text2, label = cache[0][:maxlen - template_len], cache[1][:maxlen - template_len], cache[-1]
                for text in [text1, text2]:
                    sentence_pair = []
                    for template in prompt_templates:
                        sent_num = len(tokenizer.tokenize(text))
                        prompt_sent = template.replace(replace_token, text)
                        template_sent = template.replace(replace_token, replace_token * sent_num)
                        sentence_pair.extend([prompt_sent, template_sent])
                    D.append((sentence_pair, int(label)))
    return D

all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)
train_texts = load_data(all_names)
valid_texts = list(zip(train_texts[::2], train_texts[1::2]))

if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# Training dataloader
def collate_fn(batch):
    batch_tensor = [[] for _ in range(4)]
    for prompt_data, _ in batch:
        for i, item in enumerate(prompt_data):
            batch_tensor[i].append(tokenizer.encode(item, maxlen=maxlen)[0])
    for i, item in enumerate(batch_tensor):
        batch_tensor[i] = torch.tensor(sequence_padding(item, maxlen), dtype=torch.long, device=device)
    labels = torch.arange(batch_tensor[0].size(0), device=device)
    return batch_tensor, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_test(batch):
    text1_ids, text2_ids, labels = [], [], []
    for text1, text2 in batch:
        label = text1[-1]
        text1, text2 = text1[0][0], text2[0][0]
        text1_ids.append(tokenizer.encode(text1, maxlen=maxlen)[0])
        text2_ids.append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    text1_ids = torch.tensor(sequence_padding(text1_ids), dtype=torch.long, device=device)
    text2_ids = torch.tensor(sequence_padding(text2_ids), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.long, device=device)
    return [text1_ids, text2_ids], labels
valid_dataloader = DataLoader(ListDataset(data=valid_texts), batch_size=batch_size, collate_fn=collate_fn_test)

# ============================= Define the model =============================
class PromptBert(BaseModel):
    def __init__(self, scale=20.0):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=model_name,
                                            dropout_rate=dropout_rate, segment_vocab_size=0, compound_tokens=compound_tokens)
        self.scale = scale

    def forward(self, prompt0_input, template0_input, prompt1_input, template1_input):
        embeddings_a = self.get_sentence_embedding(prompt0_input, template0_input)
        embeddings_b = self.get_sentence_embedding(prompt1_input, template1_input)
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def get_sentence_embedding(self, prompt_input_ids, template_input_ids):
        prompt_mask_embedding = self.get_mask_embedding(prompt_input_ids)
        template_mask_embedding = self.get_mask_embedding(template_input_ids)
        # To cancel out the bias of the prompt template, the sentence embedding is the [MASK] representation
        # of the prompted sentence minus the [MASK] representation of the bare template.
        sentence_embedding = prompt_mask_embedding - template_mask_embedding
        return sentence_embedding

    def get_mask_embedding(self, input_ids):
        last_hidden_state = self.bert([input_ids])
        mask_index = (input_ids == tokenizer._token_mask_id).long()
        input_mask_expanded = mask_index.unsqueeze(-1).expand(last_hidden_state.size()).float()
        mask_embedding = torch.sum(last_hidden_state * input_mask_expanded, 1)
        return mask_embedding

    def predict(self, input_ids):
        self.eval()
        with torch.no_grad():
            mask_embedding = self.get_mask_embedding(input_ids)
        return mask_embedding

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = PromptBert().to(device)

# Loss and optimizer; both can be customized
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """Evaluation and checkpoint saving
    """
    def __init__(self):
        self.best_val_sim = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_sim = self.evaluate(valid_dataloader)
        if val_sim > self.best_val_sim:
            self.best_val_sim = val_sim
            # model.save_weights('best_model.pt')
        print(f'val_sim: {val_sim:.5f}, best_val_sim: {self.best_val_sim:.5f}\n')

    @staticmethod
    def evaluate(data):
        embeddings1, embeddings2, labels = [], [], []
        for (text1_ids, text2_ids), label in data:
            embeddings1.append(model.predict(text1_ids))
            embeddings2.append(model.predict(text2_ids))
            labels.append(label)
        embeddings1 = torch.cat(embeddings1)
        embeddings2 = torch.cat(embeddings2)
        labels = torch.cat(labels)
        sims = F.cosine_similarity(embeddings1, embeddings2).cpu().numpy()
        labels = labels.cpu().numpy()
        return spearmanr(sims, labels)[0]

if __name__ == "__main__":
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
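# Added note: at inference time a single sentence embedding is just the [MASK] representation of the
# prompted sentence (optionally minus the bare-template representation, as in training). A minimal
# sketch using the tokenizer, templates and model defined above; it is not called by this script.
def _prompt_embedding_sketch(text):
    prompt_sent = prompt_templates[0].replace(replace_token, text)
    token_ids = tokenizer.encode(prompt_sent, maxlen=maxlen)[0]
    token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    return model.predict(token_ids)  # [1, hidden_size] embedding taken at the [MASK] position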
examples/sentence_embedding/task_sentence_embedding_unsup_SimCSE.py
0 → 100644
View file @
66a1d0d0
#! -*- coding: utf-8 -*-
# SimCSE on Chinese STS tasks
# bert4keras reference: https://kexue.fm/archives/8348
# | solution | ATEC  | BQ   | LCQMC | PAWSX | STS-B |
# | SimCSE   | 33.90 | 50.29| 71.81 | 13.14 | 71.09 |

from bert4torch.snippets import sequence_padding
from tqdm import tqdm
import numpy as np
import scipy.stats
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import sequence_padding, Callback, get_pool_emb
from torch.utils.data import DataLoader
from torch import optim, nn
import torch
from bert4torch.snippets import ListDataset
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.3  # for debugging
print(model_type, pooling, task_name, dropout_rate)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load labelled data.
    Format per line: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# Training dataloader
def collate_fn(batch):
    texts_list = [[] for _ in range(2)]
    for text in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        texts_list[0].append(token_ids)
        texts_list[1].append(token_ids)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
    return texts_list, labels
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Build the model
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                            dropout_rate=dropout_rate, with_pool=with_pool,
                                            output_all_encoded_layers=output_all_encoded_layers)
        self.scale = scale

    def forward(self, token_ids_list):
        reps = []
        for token_ids in token_ids_list:
            hidden_state1, pooler = self.bert([token_ids])
            rep = get_pool_emb(hidden_state1, pooler, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)
        embeddings_a = reps[0]
        embeddings_b = torch.cat(reps[1:])
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz]
        return scores

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            output = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return output

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = Model(pool_method=pooling).to(device)
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), 1e-5))

class Evaluator(Callback):
    """Evaluation and checkpoint saving
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

def evaluate(dataloader):
    # model prediction
    # normalize, cosine similarity, Spearman correlation
    sims_list, labels = [], []
    for (a_token_ids, b_token_ids), label in tqdm(dataloader):
        a_vecs = model.encode(a_token_ids)
        b_vecs = model.encode(b_token_ids)
        a_vecs = torch.nn.functional.normalize(a_vecs, p=2, dim=1).cpu().numpy()
        b_vecs = torch.nn.functional.normalize(b_vecs, p=2, dim=1).cpu().numpy()

        sims = (a_vecs * b_vecs).sum(axis=1)
        sims_list.append(sims)
        labels.append(label.cpu().numpy())
    corrcoef = scipy.stats.spearmanr(np.concatenate(labels), np.concatenate(sims_list)).correlation
    return corrcoef

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=5, callbacks=[evaluator])
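# Added note: in collate_fn above each sentence is fed to the encoder twice with identical token_ids;
# the two views differ only through dropout, which is SimCSE's whole augmentation. The tiny sketch
# below only illustrates that "same input, two dropout masks" idea; it is not called by this script.
def _dropout_views_sketch(x):
    drop = torch.nn.Dropout(p=dropout_rate)   # a fresh module defaults to training mode, so dropout is active
    view1, view2 = drop(x), drop(x)           # same input, two stochastic masks -> a positive pair
    return view1, view2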
examples/sentence_embedding/task_sentence_embedding_unsup_TSDAE.py
0 → 100644
View file @
66a1d0d0
#! -*- coding:utf-8 -*-
# Unsupervised semantic similarity (TSDAE)
# An encoder embeds a corrupted (token-deleted) sentence into a single vector; a decoder reconstructs the original sentence from that vector
# | solution | ATEC | BQ   | LCQMC | PAWSX | STS-B | comment |
# | TSDAE    | ——   | 46.65| 65.30 | 12.54 | ——    | —— means the metric was abnormal and not recorded |

from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
import re
from tqdm import tqdm
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
model_type, pooling, task_name, dropout_rate = sys.argv[1:]  # command-line arguments
# model_type, pooling, task_name, dropout_rate = 'BERT', 'cls', 'ATEC', 0.1  # for debugging
print(model_type, pooling, task_name, dropout_rate)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'
dropout_rate = float(dropout_rate)
batch_size = 32

if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

def load_data(filenames):
    """Load labelled data.
    Format per line: (text1, text2, label)
    """
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], float(l[2])))
    return D

all_texts = load_data(all_names)
train_texts = [j for i in all_texts for j in i[:2]]
if task_name != 'PAWSX':
    np.random.shuffle(train_texts)
    train_texts = train_texts[:10000]

# Training dataloader
def collate_fn(batch):
    def add_noise(token_ids, del_ratio=0.6):
        n = len(token_ids)
        keep_or_not = np.random.rand(n) > del_ratio
        if sum(keep_or_not) == 0:
            keep_or_not[np.random.choice(n)] = True  # guarantee that at least one word remains
        return list(np.array(token_ids)[keep_or_not])

    texts_list = [[] for _ in range(3)]
    for text in batch:
        token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
        texts_list[0].append([tokenizer._token_start_id] + add_noise(token_ids[1:-1]) + [tokenizer._token_end_id])
        texts_list[1].append(token_ids[:-1])
        texts_list[2].append(token_ids[1:])
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    return texts_list[:2], texts_list[2].flatten()
train_dataloader = DataLoader(ListDataset(data=train_texts), shuffle=True, batch_size=batch_size, collate_fn=collate_fn)

# Evaluation dataloader
def collate_fn_eval(batch):
    texts_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        texts_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        texts_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, texts in enumerate(texts_list):
        texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.float, device=device)
    return texts_list, labels
valid_dataloader = DataLoader(ListDataset(data=all_texts), batch_size=batch_size, collate_fn=collate_fn_eval)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.encoder = build_transformer_model(config_path, checkpoint_path, model=model_name, segment_vocab_size=0,
                                               dropout_rate=dropout_rate, with_pool=with_pool,
                                               output_all_encoded_layers=output_all_encoded_layers)
        # Initialize the decoder with BERT weights; the cross-attention part is randomly initialized
        self.decoder = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=model_name,
                                               application='lm', dropout_rate=dropout_rate,
                                               output_all_encoded_layers=output_all_encoded_layers,
                                               is_decoder=True, segment_vocab_size=0)
        self.pool_method = pool_method

        # Tie encoder and decoder weights
        decoder_names = {k for k, _ in self.decoder.named_parameters()}
        for enc_k, v in self.encoder.named_parameters():
            dec_k = enc_k
            if dec_k in decoder_names:
                rep_str = f'self.encoder.{enc_k} = self.decoder.{dec_k}'
                if re.search('\.[0-9]+\.', rep_str):
                    temp = '[' + re.findall('\.[0-9]+\.', rep_str)[0][1:-1] + '].'
                    rep_str = re.sub('\.[0-9]+\.', temp, rep_str)
                exec(rep_str)
            else:
                print(enc_k, dec_k)

    def forward(self, token_ids_list):
        token_ids1 = token_ids_list[0]
        hidden_state1, pool_cls1 = self.encoder([token_ids1])
        embeddings_a = get_pool_emb(hidden_state1, pool_cls1, token_ids1.gt(0).long(), self.pool_method)

        token_ids2 = token_ids_list[1]
        encoder_embedding = embeddings_a.unsqueeze(1)
        encoder_attention_mask = torch.ones_like(token_ids1)[:, 0:1][:, None, None, :]
        _, logits = self.decoder([token_ids2, encoder_embedding, encoder_attention_mask])
        return logits.reshape(-1, logits.shape[-1])

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.encoder([token_ids])
            output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return output

model = Model(pool_method=pooling).to(device)

# Loss and optimizer; both can be customized
model.compile(
    loss=nn.CrossEntropyLoss(ignore_index=0),
    optimizer=optim.Adam(model.parameters(), lr=2e-4),
)

# Evaluation function
def evaluate(data):
    cosine_scores, labels = [], []
    for (batch_token1_ids, batch_token2_ids), label in tqdm(data):
        embeddings1 = model.encode(batch_token1_ids).cpu().numpy()
        embeddings2 = model.encode(batch_token2_ids).cpu().numpy()
        cosine_score = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        cosine_scores.append(cosine_score)
        labels.append(label)
    cosine_scores = np.concatenate(cosine_scores)
    labels = torch.cat(labels).cpu().numpy()
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """Evaluation and checkpoint saving
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
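# Added note: the exec()-based string assignment in Model.__init__ ties encoder and decoder parameters
# by name. The helper below is only a sketch of an alternative that shares the underlying storage
# without exec (gradients would still accumulate separately per Parameter); it assumes matching
# parameter names and is illustrative only, not used by this script.
def _tie_by_name_sketch(encoder, decoder):
    dec_params = dict(decoder.named_parameters())
    for name, param in encoder.named_parameters():
        if name in dec_params:
            # point the encoder parameter's tensor at the decoder tensor so both share storage
            param.data = dec_params[name].data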
examples/sentence_embedding/task_sentence_embedding_unsup_bert_whitening.py
0 → 100644
View file @
66a1d0d0
#! -*- coding:utf-8 -*-
# bert_whitening
# Official repo: https://github.com/bojone/BERT-whitening
# cls pooling, no dimensionality reduction
# | solution       | ATEC  | BQ   | LCQMC | PAWSX | STS-B |
# | Bert-whitening | 26.79 | 31.81| 56.34 | 17.22 | 67.45 |

from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, get_pool_emb
from bert4torch.layers import BERT_WHITENING
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import numpy as np
import scipy.stats
import sys
import jieba
jieba.initialize()

# ============================= Basic parameters =============================
# model_type, pooling, task_name, n_components = sys.argv[1:]  # command-line arguments
model_type, pooling, task_name, n_components = 'BERT', 'cls', 'ATEC', -1  # for debugging
print(model_type, pooling, task_name, n_components)

assert model_type in {'BERT', 'RoBERTa', 'NEZHA', 'RoFormer', 'SimBERT'}
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}
assert task_name in {'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B'}
if model_type in {'BERT', 'RoBERTa', 'SimBERT'}:
    model_name = 'bert'
elif model_type in {'RoFormer'}:
    model_name = 'roformer'
elif model_type in {'NEZHA'}:
    model_name = 'nezha'

n_components = int(n_components)
if n_components < 0:
    if model_type.endswith('large'):
        n_components = 1024
    elif model_type.endswith('tiny'):
        n_components = 312
    elif model_type.endswith('small'):
        n_components = 384
    else:
        n_components = 768

batch_size = 128
if task_name == 'PAWSX':
    maxlen = 128
else:
    maxlen = 64

# BERT configuration
model_dir = {
    'BERT': 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12',
    'RoBERTa': 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base',
    'NEZHA': 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base',
    'RoFormer': 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base',
    'SimBERT': 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base',
}[model_type]
config_path = f'{model_dir}/bert_config.json' if model_type == 'BERT' else f'{model_dir}/config.json'
checkpoint_path = f'{model_dir}/pytorch_model.bin'
dict_path = f'{model_dir}/vocab.txt'
data_path = 'F:/Projects/data/corpus/sentence_embedding/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ============================= Load datasets =============================
# Build the tokenizer
if model_type in ['RoFormer']:
    tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.lcut(s, HMM=False))
else:
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Read data
all_names = [f'{data_path}{task_name}/{task_name}.{f}.data' for f in ['train', 'valid', 'test']]
print(all_names)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load data.
        Format per line: (text1, text2, label id)
        """
        D = []
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    l = l.strip().split('\t')
                    if len(l) == 3:
                        D.append((l[0], l[1], float(l[2])))
                        # if len(D) > 1000:
                        #     break
        return D

def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([label])
    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# Load dataset
train_dataloader = DataLoader(MyDataset(all_names), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='mean'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.pool_method = pool_method

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pool_cls, attention_mask, self.pool_method)
        return output

model = Model().to(device)

# Extract sentence embeddings for the whole training set
sen_emb_list, sen_labels = [], []
for token_ids, labels in tqdm(train_dataloader, desc='Encoding'):
    sen1_emb = model.encode(token_ids[0])
    sen2_emb = model.encode(token_ids[1])
    sen_emb_list.append((sen1_emb, sen2_emb))
    sen_labels.append(labels)

# Apply the bert_whitening module
bert_whitening = BERT_WHITENING()
if n_components > 0:
    bert_whitening.compute_kernel_bias([v for vecs in sen_emb_list for v in vecs])
    bert_whitening.kernel = bert_whitening.kernel[:, :n_components]

# transform, normalize, similarity, correlation
all_sims = []
for (a_vecs, b_vecs) in tqdm(sen_emb_list, desc='Transform'):
    a_vecs = bert_whitening.transform_and_normalize(a_vecs)
    b_vecs = bert_whitening.transform_and_normalize(b_vecs)
    sims = (a_vecs * b_vecs).sum(axis=1)
    all_sims.append(sims)
all_sims = torch.cat(all_sims, dim=0)
sen_labels = torch.cat(sen_labels, dim=0)
corrcoef = scipy.stats.spearmanr(sen_labels.cpu().numpy(), all_sims.cpu().numpy()).correlation
print(f'{task_name} corrcoefs: ', corrcoef)
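# Added note: BERT_WHITENING (bert4torch's implementation) derives its kernel/bias from the empirical
# mean and covariance of the sentence vectors. The numpy helper below is only a sketch of the standard
# recipe from the referenced BERT-whitening repo (mean + covariance + SVD), for illustration; it is
# not what this script calls.
def _whitening_sketch(vecs):
    mu = vecs.mean(axis=0, keepdims=True)        # [1, dim] mean (the "bias")
    cov = np.cov(vecs.T)                         # [dim, dim] covariance
    u, s, vh = np.linalg.svd(cov)
    kernel = u @ np.diag(1.0 / np.sqrt(s))       # whitening kernel W = U * S^(-1/2)
    out = (vecs - mu) @ kernel                   # whitened vectors
    return out / (np.linalg.norm(out, axis=1, keepdims=True) + 1e-8)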
\ No newline at end of file
examples/seq2seq/task_kgclue_seq2seq.py
0 → 100644
View file @
66a1d0d0
#! -*- coding: utf-8 -*-
# KgCLUE baseline
# Seq2Seq with UniLM, decoding constrained by a prefix trie, plus a home-grown "look-ahead" strategy;
# the base model is RoFormer-Sim-FT, about 2% better than plain RoFormer/BERT/RoBERTa;
# Write-up: https://kexue.fm/archives/8802

import os, json
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch.optim as optim
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset, sequence_padding, AutoRegressiveDecoder, Callback
from tqdm import tqdm
from collections import defaultdict
# import pylcs

def lcs(source, target):
    """Longest common subsequence (non-contiguous) of source and target.
    Returns: subsequence length, mapping (a list of index pairs).
    Note: the LCS may not be unique; the returned mapping is just one of them.
    """
    c = defaultdict(int)
    for i, si in enumerate(source, 1):
        for j, tj in enumerate(target, 1):
            if si == tj:
                c[i, j] = c[i - 1, j - 1] + 1
            elif c[i, j - 1] > c[i - 1, j]:
                c[i, j] = c[i, j - 1]
            else:
                c[i, j] = c[i - 1, j]

    l, mapping = c[len(source), len(target)], []
    i, j = len(source) - 1, len(target) - 1
    while len(mapping) < l:
        if source[i] == target[j]:
            mapping.append((i, j))
            i, j = i - 1, j - 1
        elif c[i + 1, j] > c[i, j + 1]:
            j = j - 1
        else:
            i = i - 1
    return l, mapping[::-1]
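# Added note: a quick illustration of lcs(); "abcde" and "ace" share the subsequence "ace", so the
# length is 3 and the mapping pairs up the matching positions. Illustrative only, not used below.
def _lcs_example():
    length, mapping = lcs('abcde', 'ace')
    assert length == 3 and mapping == [(0, 0), (2, 1), (4, 2)]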
def subject_split(s):
    """Split off the sense/disambiguation part (in parentheses), if present.
    """
    m = ''
    if s[-1] == u')':
        i = s.index(u'(')
        m = s[i + 1:-1]
        s = s[:i]
    return s, m

def load_data(filename):
    """Read the dataset.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            s, p, o = l['answer'].split(' ||| ')
            s, m = subject_split(s)
            D.append((l['question'], (s, p, m, ' '.join(o.split()))))
    return D

class Trie(object):
    """A custom trie used to store the knowledge base.
    """
    def __init__(self, value_key=-1):
        self.data = {}
        self.value_key = str(value_key)

    def __setitem__(self, key, value):
        """Insert a (key, value) pair into the trie.
        """
        data = self.data
        for k in key:
            k = str(k)
            if k not in data:
                data[k] = {}
            data = data[k]
        if self.value_key in data:
            if data[self.value_key] != value:
                data[self.value_key] += ('\t' + value)
        else:
            data[self.value_key] = value

    def __getitem__(self, key):
        """Get the value stored for key.
        """
        data = self.data
        for k in key:
            k = str(k)
            data = data[k]
        return data[self.value_key]

    def next_ones(self, prefix):
        """Allowed set of tokens one position after prefix.
        """
        data = self.data
        for k in prefix:
            k = str(k)
            data = data[k]
        return [k for k in data if k != self.value_key]

    def keys(self, prefix=None, data=None):
        """All keys that start with prefix.
        """
        data = data or self.data
        prefix = prefix or []
        for k in prefix:
            k = str(k)
            if k not in data:
                return []
            data = data[k]
        results = []
        for k in data:
            if k == self.value_key:
                results.append([])
            else:
                results.extend([[k] + j for j in self.keys(None, data[k])])
        return [prefix + i for i in results]

    def save(self, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.data, f, ensure_ascii=False)

    def load(self, filename):
        with open(filename, encoding='utf-8') as f:
            self.data = json.load(f)
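# Added note: a small usage sketch of the Trie above. Keys are sequences of token ids (stored as
# strings internally); next_ones() returns the allowed continuations used later for constrained
# decoding. Illustrative only, not used below.
def _trie_example():
    trie = Trie()
    trie[[1, 2, 3]] = 'first'
    trie[[1, 2, 4]] = 'second'
    assert sorted(trie.next_ones([1, 2])) == ['3', '4']  # allowed next tokens after prefix [1, 2]
    assert trie[[1, 2, 3]] == 'first'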
# Basic parameters
maxlen = 128
batch_size = 32
epochs = 10

# Model paths
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_ft_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Convert the knowledge base into a trie
KG = Trie()
if os.path.exists('../datasets/KG.json'):
    KG.load('../datasets/KG.json')
else:
    with open('F:/Projects/data/corpus/kg/KgCLUE/Knowledge_20211215.txt', 'r', encoding='utf-8') as f:
        # count = 0
        for l in tqdm(f):
            s, p, o = l.split('\t')
            s, m = subject_split(s)
            ids = tokenizer.encode(s, p)[0][1:]
            ids += tokenizer.encode(m)[0][1:-1]
            KG[ids] = ' '.join(o.split())
            # count += 1
            # if count > 10000:
            #     break
    KG.save('../datasets/KG.json')

def collate_fn(batch):
    """Data generator.
    Single sample: [CLS] Q [SEP] S [SEP] P [SEP] M [SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for (q, a) in batch:
        q_ids = tokenizer.encode(q, maxlen=maxlen // 2 + 1)[0]
        a_ids = tokenizer.encode(a[0], a[1])[0]
        a_ids += tokenizer.encode(a[2])[0][1:]
        token_ids = (q_ids + a_ids[1:])[:maxlen]
        segment_ids = [0] * len(q_ids)
        segment_ids += [1] * (len(token_ids) - len(q_ids))
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

# Read datasets
train_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/train.json')
train_dataloader = DataLoader(ListDataset(train_data), shuffle=True, collate_fn=collate_fn)
valid_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/dev.json')
test_data = load_data('F:/Projects/data/corpus/kg/KgCLUE/test_public.json')

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, vocab_size]
        targets: y_true, y_segment
        UniLM style: the non-seq2seq part has to be masked out manually
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids happen to mark the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model = build_transformer_model(config_path, checkpoint_path, model='roformer', application='unilm').to(device)
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 5e-6))

class AutoQA(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        all_token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([all_token_ids, segment_ids])
        probas = F.softmax(y_pred[:, -1, :], dim=-1)
        new_probas = torch.zeros_like(probas)
        for i, ids in enumerate(output_ids):
            ids = ids.cpu().numpy()
            next_ids = [int(j) for j in KG.next_ones(ids)]  # allowed set for the next position
            # =========== look-ahead: if a candidate continuation in the trie would cover more of the question, boost its probability
            if len(next_ids) > 1 and self.end_id in ids:  # more than one allowed token and S has already been decoded
                candidates = KG.keys(list(ids))  # possible completions
                weights = torch.ones_like(probas[i])  # default weight 1
                lcs0 = lcs(ids, token_ids[i])[0]  # tokens already covered
                for c in candidates:
                    if len(c) > len(ids):
                        c = [int(j) for j in c]
                        w = lcs(c, token_ids[i])[0] - lcs0  # tokens that could still be covered
                        weights[c[len(ids)]] = max(w + 1, weights[c[len(ids)]].cpu().numpy())
                probas[i] = torch.pow(probas[i], 1. / weights)  # boost by p^(1/n)
            if not next_ids:  # an empty allowed set means decoding should end
                next_ids.append(self.end_id)
            new_probas[i, next_ids] += probas[i, next_ids]  # keep only allowed-set probabilities
        new_probas /= new_probas.sum(axis=1, keepdims=True)  # renormalize
        return new_probas

    def generate(self, text, topk=1):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk, min_ends=3)  # beam search
        end_idxs = [i for i, j in enumerate(output_ids) if j == self.end_id]
        subject_ids = output_ids[:end_idxs[0]]
        predicate_ids = output_ids[end_idxs[0]:end_idxs[1]]
        meaning_ids = output_ids[end_idxs[1]:]
        return (
            tokenizer.decode(subject_ids.cpu().numpy()),
            tokenizer.decode(predicate_ids.cpu().numpy()),
            tokenizer.decode(meaning_ids.cpu().numpy()),
            KG[output_ids[:-1].cpu().numpy()]
        )

autoqa = AutoQA(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

class Evaluator(Callback):
    """Evaluation and checkpoint saving
    """
    def __init__(self):
        self.best_score = 0

    def on_epoch_end(self, steps, epoch, logs=None):
        # save the best
        em, f1, score = self.evaluate(valid_data, topk=3)
        if score >= self.best_score:
            self.best_score = score
            # model.save_weights('./best_model.weights')
        print(
            u'[VALID] em: %.5f, f1: %.5f, score: %.5f, best_score: %.5f\n'
            % (em, f1, score, self.best_score)
        )

    def f1sim(self, text_a, text_b):
        """F1 similarity of two texts.
        Computed as twice the length of their longest common subsequence divided by the sum of
        their lengths. pylcs is recommended for speed.
        """
        if not text_a and not text_b:
            return 0.
        else:
            lcs_len = lcs(text_a, text_b)[0]
            return 2. * lcs_len / (len(text_a) + len(text_b))

    def evaluate(self, data, topk=1):
        """Evaluation.
        Note: one (S, P) pair may map to several O, but the annotation keeps only one; to align with
        the annotation (and score higher) only the first one is kept here as well.
        """
        em, f1, total = 0., 0., 0.
        for d in tqdm(data, ncols=0):
            a = autoqa.generate(d[0], topk=topk)
            o = a[3].split('\t')[0]  # if there are several, keep only the first one
            em += float(o == d[1][3])
            f1 += self.f1sim(o, d[1][3])
            total += 1
        em /= total
        f1 /= total
        return em, f1, (em + f1) / 2

def test_predict(in_file, out_file, topk=1):
    """Write test predictions to a file.
    The result file can be submitted to https://www.cluebenchmarks.com for evaluation.
    """
    fw = open(out_file, 'w')
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            s, p, m, o = autoqa.generate(l['question'], topk=topk)
            if m:
                s += u'(%s)' % m
            l['answer'] = '%s ||| %s ||| %s' % (s, p, o.split('\t')[0])
            l = json.dumps(l, ensure_ascii=False)
            fw.write(l + '\n')
    fw.close()

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=epochs, callbacks=[evaluator])

    model.load_weights('./best_model.weights')
    em, f1, score = evaluator.evaluate(test_data, topk=1)
    print(u'[TEST] topk=1, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
    em, f1, score = evaluator.evaluate(test_data, topk=3)
    print(u'[TEST] topk=3, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
    em, f1, score = evaluator.evaluate(test_data, topk=5)
    print(u'[TEST] topk=5, em: %.5f, f1: %.5f, score: %.5f' % (em, f1, score))
else:
    model.load_weights('./best_model.weights')
    # test_predict('../datasets/test.json', 'kgclue_predict.json', topk=3)
\ No newline at end of file
examples/seq2seq/task_question_answer_generation_by_seq2seq.py
0 → 100644
View file @
66a1d0d0
#! -*- coding: utf-8 -*-
# Build reading-comprehension data with Seq2Seq
# Sample an answer from the passage first, then generate the question
# Same dataset as https://github.com/bojone/dgcnn_for_reading_comprehension

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Basic parameters
max_p_len = 128
max_q_len = 64
max_a_len = 16
batch_size = 24
epochs = 100

# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def process_data():
    if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json'):
        return

    # labelled data
    webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
    sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))

    # filter the data
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    data = []
    for d in webqa_data + sogou_data:
        for p in d['passages']:
            if p['answer']:
                for t in text_segmentate(p['passage'], max_p_len - 2, seps, strips):
                    if p['answer'] in t:
                        data.append((t, d['question'], p['answer']))
    del webqa_data
    del sogou_data

    # save a random order (used for the valid split)
    random_order = list(range(len(data)))
    np.random.seed(2022)
    np.random.shuffle(random_order)

    # split out valid
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
    json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json', 'w'), indent=4)
    json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data_list_format.json', 'w'), indent=4)
process_data()

class MyDataset(ListDataset):
    @staticmethod
    def load_data(file_path):
        return json.load(open(file_path))

# Load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """Single sample: [CLS] passage [SEP] answer [SEP] question [SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for (p, q, a) in batch:
        p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len + 1)
        a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len)
        q_token_ids, _ = tokenizer.encode(q, maxlen=max_q_len)
        token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:]  # drop the cls position of answer and question
        segment_ids = [0] * len(p_token_ids)
        segment_ids += [1] * (len(token_ids) - len(p_token_ids))
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data_list_format.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data_list_format.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only tokens in keep_tokens, shrinking the original vocab
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, hdsz]
        targets: y_true, y_segment
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids happen to mark the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)
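# Added note: the loss above relies on UniLM's convention that segment_ids are 0 on the input part and
# 1 on the part to be generated, so (y_true * y_mask) zeroes out input positions and ignore_index=0
# then skips them. A toy sketch of the shift-by-one alignment; illustrative only, never called here.
def _unilm_shift_sketch():
    y_true = torch.tensor([[101, 7, 8, 9, 102]])   # e.g. [CLS] x x x [SEP]
    y_mask = torch.tensor([[0, 0, 1, 1, 1]])       # 1 marks the seq2seq (target) part
    targets = y_true[:, 1:] * y_mask[:, 1:]        # what the model at step t must predict at t+1
    return targets                                 # tensor([[0, 8, 9, 102]]); the 0s are ignored by the loss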
model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class QuestionAnswerGeneration(AutoRegressiveDecoder):
    """Randomly sample an answer, then generate the question with beam search.
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, passage, topk=1, topp=0.95):
        token_ids, segment_ids = tokenizer.encode(passage, maxlen=max_p_len)
        a_ids = self.random_sample([token_ids, segment_ids], 1, topp=topp)[0]  # random sampling
        token_ids += list(a_ids)
        segment_ids += [1] * len(a_ids)
        q_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search
        return (tokenizer.decode(q_ids.cpu().numpy()), tokenizer.decode(a_ids.cpu().numpy()))

qag = QuestionAnswerGeneration(start_id=None, end_id=tokenizer._token_end_id, maxlen=max_q_len, device=device)

def predict_to_file(data, filename, topk=1):
    """Write predictions to a file for easy evaluation.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
            q, a = qag.generate(d[0])
            s = '%s\t%s\t%s\n' % (q, a, d[0])
            f.write(s)
            f.flush()

class Evaluator(Callback):
    """Evaluation and checkpoint saving
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # save the best
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        predict_to_file(valid_dataset.data[:100], 'qa.csv')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=100, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
    # predict_to_file(valid_data, 'qa.csv')
examples/seq2seq/task_reading_comprehension_by_mlm.py
0 → 100644
View file @
66a1d0d0
#! -*- coding: utf-8 -*-
# Reading comprehension via MLM
# Same dataset and evaluation as https://github.com/bojone/dgcnn_for_reading_comprehension

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import re
import torch.nn.functional as F

# Basic parameters
max_p_len = 256
max_q_len = 64
max_a_len = 32
batch_size = 12
epochs = 10

# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def process_data():
    if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'):
        return

    # labelled data
    webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
    sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))

    # save a random order (used for the valid split)
    random_order = list(range(len(sogou_data)))
    np.random.seed(2022)
    np.random.shuffle(random_order)

    # split out valid
    train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
    valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
    train_data.extend(train_data)
    train_data.extend(webqa_data)  # mix SogouQA and WebQA at a 2:1 ratio
    json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json', 'w', encoding='utf-8'), indent=4)
    json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json', 'w', encoding='utf-8'), indent=4)
process_data()

class MyDataset(ListDataset):
    @staticmethod
    def load_data(file_path):
        return json.load(open(file_path))

# Load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """Single sample:
    input:  [CLS][MASK][MASK][SEP]question[SEP]passage[SEP]
    output: answer
    """
    batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], []
    for D in batch:
        question = D['question']
        answers = [p['answer'] for p in D['passages'] if p['answer']]
        passage = np.random.choice(D['passages'])['passage']
        passage = re.sub(u' |、|;|,', ',', passage)
        final_answer = ''
        for answer in answers:
            if all([a in passage[:max_p_len - 2] for a in answer.split(' ')]):
                final_answer = answer.replace(' ', ',')
                break
        a_token_ids, _ = tokenizer.encode(final_answer, maxlen=max_a_len + 1)
        q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1)
        p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
        token_ids = [tokenizer._token_start_id]
        token_ids += ([tokenizer._token_mask_id] * max_a_len)
        token_ids += [tokenizer._token_end_id]
        token_ids += (q_token_ids[1:] + p_token_ids[1:])
        segment_ids = [0] * len(token_ids)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_a_token_ids.append(a_token_ids[1:])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_a_token_ids = torch.tensor(sequence_padding(batch_a_token_ids, max_a_len), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_a_token_ids

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    keep_tokens=keep_tokens,  # keep only tokens in keep_tokens, shrinking the original vocab
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def forward(self, outputs, y_true):
        '''
        y_pred: [btz, seq_len, hdsz]
        y_true: [btz, max_a_len]
        '''
        _, y_pred = outputs
        y_pred = y_pred[:, 1:max_a_len + 1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = y_true.flatten()
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

def get_ngram_set(x, n):
    """Build the n-gram set, returned as:
    {(n-1)-gram: set(n-th tokens of the matching n-grams)}
    """
    result = {}
    for i in range(len(x) - n + 1):
        k = tuple(x[i:i + n])
        if k[:-1] not in result:
            result[k[:-1]] = set()
        result[k[:-1]].add(k[-1])
    return result
def
gen_answer
(
question
,
passages
):
"""由于是MLM模型,所以可以直接argmax解码。
"""
all_p_token_ids
,
token_ids
,
segment_ids
=
[],
[],
[]
for
passage
in
passages
:
passage
=
re
.
sub
(
u
' |、|;|,'
,
','
,
passage
)
p_token_ids
,
_
=
tokenizer
.
encode
(
passage
,
maxlen
=
max_p_len
+
1
)
q_token_ids
,
_
=
tokenizer
.
encode
(
question
,
maxlen
=
max_q_len
+
1
)
all_p_token_ids
.
append
(
p_token_ids
[
1
:])
token_ids
.
append
([
tokenizer
.
_token_start_id
])
token_ids
[
-
1
]
+=
([
tokenizer
.
_token_mask_id
]
*
max_a_len
)
token_ids
[
-
1
]
+=
[
tokenizer
.
_token_end_id
]
token_ids
[
-
1
]
+=
(
q_token_ids
[
1
:]
+
p_token_ids
[
1
:])
segment_ids
.
append
([
0
]
*
len
(
token_ids
[
-
1
]))
token_ids
=
torch
.
tensor
(
sequence_padding
(
token_ids
),
device
=
device
)
segment_ids
=
torch
.
tensor
(
sequence_padding
(
segment_ids
),
device
=
device
)
logit
=
model
.
predict
([
token_ids
,
segment_ids
])[
-
1
][:,
1
:
max_a_len
+
1
,
:]
probas
=
F
.
softmax
(
logit
,
dim
=-
1
)
results
=
{}
for
t
,
p
in
zip
(
all_p_token_ids
,
probas
):
a
,
score
=
tuple
(),
0.
for
i
in
range
(
max_a_len
):
idxs
=
list
(
get_ngram_set
(
t
,
i
+
1
)[
a
])
if
tokenizer
.
_token_end_id
not
in
idxs
:
idxs
.
append
(
tokenizer
.
_token_end_id
)
# pi是将passage以外的token的概率置零
pi
=
torch
.
zeros_like
(
p
[
i
])
pi
[
idxs
]
=
p
[
i
,
idxs
]
a
=
a
+
(
pi
.
argmax
().
item
(),)
score
+=
pi
.
max
().
item
()
if
a
[
-
1
]
==
tokenizer
.
_token_end_id
:
break
score
=
score
/
(
i
+
1
)
a
=
tokenizer
.
decode
(
a
)
if
a
:
results
[
a
]
=
results
.
get
(
a
,
[])
+
[
score
]
results
=
{
k
:
(
np
.
array
(
v
)
**
2
).
sum
()
/
(
sum
(
v
)
+
1
)
for
k
,
v
in
results
.
items
()
}
return
results
def
max_in_dict
(
d
):
if
d
:
return
sorted
(
d
.
items
(),
key
=
lambda
s
:
-
s
[
1
])[
0
][
0
]
def
predict_to_file
(
data
,
filename
):
"""将预测结果输出到文件,方便评估
"""
with
open
(
filename
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
for
d
in
tqdm
(
iter
(
data
),
desc
=
u
'正在预测(共%s条样本)'
%
len
(
data
)):
q_text
=
d
[
'question'
]
p_texts
=
[
p
[
'passage'
]
for
p
in
d
[
'passages'
]]
a
=
gen_answer
(
q_text
,
p_texts
)
a
=
max_in_dict
(
a
)
if
a
:
s
=
u
'%s
\t
%s
\n
'
%
(
d
[
'id'
],
a
)
else
:
s
=
u
'%s
\t\n
'
%
(
d
[
'id'
])
f
.
write
(
s
)
f
.
flush
()
class
Evaluator
(
Callback
):
"""评估与保存
"""
def
__init__
(
self
):
self
.
lowest
=
1e10
def
on_epoch_end
(
self
,
steps
,
epoch
,
logs
=
None
):
# 保存最优
if
logs
[
'loss'
]
<=
self
.
lowest
:
self
.
lowest
=
logs
[
'loss'
]
# model.save_weights('./best_model.pt')
predict_to_file
(
valid_dataset
.
data
[:
100
],
'qa.csv'
)
if
__name__
==
'__main__'
:
evaluator
=
Evaluator
()
model
.
fit
(
train_dataloader
,
steps_per_epoch
=
100
,
epochs
=
epochs
,
callbacks
=
[
evaluator
]
)
else
:
model
.
load_weights
(
'./best_model.pt'
)
# predict_to_file(valid_data, 'qa.csv')
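A quick illustration of the `get_ngram_set` helper above may help: it maps every (n-1)-gram prefix of the passage token ids to the set of ids that may follow it, which is what restricts decoding to spans of the passage. A minimal sketch, not part of the commit, assuming it is run in the same module; the token ids are made up for illustration:

x = [5, 6, 7, 6, 8]          # a toy passage as token ids
print(get_ngram_set(x, 2))   # {(5,): {6}, (6,): {8, 7}, (7,): {6}}
# during decoding, a partial answer (6,) could only be continued with 7 or 8 (or end_id)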
examples/seq2seq/task_reading_comprehension_by_seq2seq.py
0 → 100644

#! -*- coding: utf-8 -*-
# Reading comprehension as a seq2seq task
# Dataset and evaluation are the same as https://github.com/bojone/dgcnn_for_reading_comprehension

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import re

# Basic parameters
max_p_len = 256
max_q_len = 64
max_a_len = 32
max_qa_len = max_q_len + max_a_len
batch_size = 8
epochs = 10

# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def process_data():
    if os.path.exists('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'):
        return
    # labelled data
    webqa_data = json.load(open('F:/Projects/data/corpus/qa/WebQA.json', encoding='utf-8'))
    sogou_data = json.load(open('F:/Projects/data/corpus/qa/SogouQA.json', encoding='utf-8'))
    # save a fixed random order (used for the valid split)
    random_order = list(range(len(sogou_data)))
    np.random.seed(2022)
    np.random.shuffle(random_order)
    # split off the valid set
    train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
    valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
    train_data.extend(train_data)
    train_data.extend(webqa_data)  # mix SogouQA and WebQA at a ratio of 2:1
    json.dump(train_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json', 'w', encoding='utf-8'), indent=4)
    json.dump(valid_data, open('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json', 'w', encoding='utf-8'), indent=4)

process_data()

class MyDataset(ListDataset):
    @staticmethod
    def load_data(file_path):
        return json.load(open(file_path))

# Load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """Format of a single sample: [CLS]passage[SEP]question[SEP]answer[SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for D in batch:
        question = D['question']
        answers = [p['answer'] for p in D['passages'] if p['answer']]
        passage = np.random.choice(D['passages'])['passage']
        passage = re.sub(u' |、|;|,', ',', passage)
        final_answer = ''
        for answer in answers:
            if all([a in passage[:max_p_len - 2] for a in answer.split(' ')]):
                final_answer = answer.replace(' ', ',')
                break
        qa_token_ids, qa_segment_ids = tokenizer.encode(question, final_answer, maxlen=max_qa_len + 1)
        p_token_ids, p_segment_ids = tokenizer.encode(passage, maxlen=max_p_len + 1)
        token_ids = p_token_ids + qa_token_ids[1:]
        segment_ids = p_segment_ids + qa_segment_ids[1:]
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/train_data.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/qa/CIPS-SOGOU/valid_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocab
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, hdsz]
        targets: y_true, y_segment
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids mark exactly the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class ReadingComprehension(AutoRegressiveDecoder):
    """Generate the answer with beam search decoding.
    passages is a list of passages; the best answer is chosen automatically across them,
    and an empty string is returned if there is no answer.
    When mode is 'extractive', the answer must be a span of one of the original passages.
    """
    def __init__(self, mode='extractive', **kwargs):
        super(ReadingComprehension, self).__init__(**kwargs)
        self.mode = mode

    def get_ngram_set(self, x, n):
        """Build the collection of ngrams; the returned format is:
        {(n-1)-gram: set of the n-th tokens of those n-grams}
        """
        result = {}
        for i in range(len(x) - n + 1):
            k = tuple(x[i:i + n])
            if k[:-1] not in result:
                result[k[:-1]] = set()
            result[k[:-1]].add(k[-1])
        return result

    @AutoRegressiveDecoder.wraps(default_rtype='probas', use_states=True)
    def predict(self, inputs, output_ids, states):
        inputs = [i for i in inputs if i[0, 0].item() > -1]  # filter out passages without an answer
        topk = len(inputs[0])
        all_token_ids, all_segment_ids = [], []
        for token_ids in inputs:  # each element of inputs represents one passage
            token_ids = torch.cat([token_ids, output_ids], 1)
            segment_ids = torch.zeros_like(token_ids)
            if states > 0:
                segment_ids[:, -output_ids.shape[1]:] = 1
            all_token_ids.extend(token_ids)
            all_segment_ids.extend(segment_ids)
        padded_all_token_ids = sequence_padding(all_token_ids)
        padded_all_segment_ids = sequence_padding(all_segment_ids)
        _, logits = model.predict([padded_all_token_ids, padded_all_segment_ids])
        probas = nn.Softmax(dim=-1)(logits)
        # rewritten with torch.gather
        # probas = [probas[i, len(ids) - 1] for i, ids in enumerate(all_token_ids)]
        # probas = torch.stack(probas).reshape((len(inputs), topk, -1))
        index_ = torch.tensor([[len(i) - 1] for i in all_token_ids], device=probas.device).view(-1, 1, 1).expand(-1, 1, probas.shape[-1])
        probas = torch.gather(probas, dim=1, index=index_).reshape((len(inputs), topk, -1))
        if states == 0:
            # this step mainly rules out passages without an answer:
            # if the very first argmax is end_id, the passage has no answer
            argmax = probas[:, 0].argmax(dim=1)
            available_idxs = torch.where(argmax != self.end_id)[0]
            if len(available_idxs) == 0:
                scores = torch.zeros_like(probas[0])
                scores[:, self.end_id] = 1
                return scores, states + 1
            else:
                for i in torch.where(argmax == self.end_id)[0]:
                    inputs[i][:, 0] = -1  # mark the first position of no-answer passages as -1
                probas = probas[available_idxs]
                inputs = [i for i in inputs if i[0, 0] > -1]  # filter out passages without an answer
        if self.mode == 'extractive':
            # in extractive mode the answer must be a span of the passage,
            # so zero out the probabilities of everything that is not such a span
            new_probas = torch.zeros_like(probas)
            ngrams = {}
            for token_ids in inputs:
                token_ids = token_ids[0]
                sep_idx = torch.where(token_ids == tokenizer._token_end_id)[0][0]
                p_token_ids = token_ids[1:sep_idx]
                for k, v in self.get_ngram_set(p_token_ids.cpu().numpy(), states + 1).items():
                    # must be .cpu().numpy() here, otherwise the ngrams.get lookup below fails to match
                    ngrams[k] = ngrams.get(k, set()) | v
            for i, ids in enumerate(output_ids):
                available_idxs = ngrams.get(tuple(ids.cpu().numpy()), set())
                available_idxs.add(tokenizer._token_end_id)
                available_idxs = list(available_idxs)
                new_probas[:, i, available_idxs] = probas[:, i, available_idxs]
            probas = new_probas
        return (probas**2).sum(0) / (probas.sum(0) + 1), states + 1  # a kind of averaged voting

    def answer(self, question, passages, topk=1):
        token_ids = []
        for passage in passages:
            passage = re.sub(u' |、|;|,', ',', passage)
            p_token_ids = tokenizer.encode(passage, maxlen=max_p_len)[0]
            q_token_ids = tokenizer.encode(question, maxlen=max_q_len + 1)[0]
            token_ids.append(p_token_ids + q_token_ids[1:])
        output_ids = self.beam_search(token_ids, topk=topk, states=0)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())

reader = ReadingComprehension(start_id=None, end_id=tokenizer._token_end_id, maxlen=max_a_len, mode='extractive', device=device)

def predict_to_file(data, filename, topk=1):
    """Write predictions to a file for evaluation
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
            q_text = d['question']
            p_texts = [p['passage'] for p in d['passages']]
            a = reader.answer(q_text, p_texts, topk)
            if a:
                s = u'%s\t%s\n' % (d['id'], a)
            else:
                s = u'%s\t\n' % (d['id'])
            f.write(s)
            f.flush()

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # save the best
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        predict_to_file(valid_dataset.data[:100], 'qa.csv')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=100, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
    predict_to_file(valid_dataset.data, 'qa.csv')
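The custom CrossEntropyLoss in this script (and in the UniLM-style scripts that follow) relies on the fact that the segment_ids mark exactly the tokens to be generated. A minimal sketch of the masking step, not part of the commit; the token ids below are made up for illustration:

import torch

token_ids   = torch.tensor([[101, 8, 9, 102, 15, 16, 102]])  # [CLS] passage/question [SEP] answer [SEP]
segment_ids = torch.tensor([[0,   0, 0, 0,   1,  1,  1]])    # 1 marks the part to be generated
y_true = token_ids[:, 1:]             # targets, shifted left by one position
y_mask = segment_ids[:, 1:]           # 1 only on the answer tokens (and the trailing [SEP])
labels = (y_true * y_mask).flatten()  # non-target positions become 0, which equals ignore_index
print(labels)                         # tensor([  0,   0,   0,  15,  16, 102])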
examples/seq2seq/task_seq2seq_ape210k_math_word_problem.py
0 → 100644

#! -*- coding: utf-8 -*-
# Solving primary-school math word problems with Seq2Seq
# Dataset: ape210k: https://github.com/Chenny0808/ape210k
# Write-up: https://kexue.fm/archives/7809

from __future__ import division
import json, re
from tqdm import tqdm
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from torch import nn, optim
import torch
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.snippets import AutoRegressiveDecoder
from sympy import Integer
import warnings
warnings.filterwarnings("ignore")

# Basic parameters
maxlen = 192
batch_size = 16
epochs = 100

# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[hit_torch_base]--chinese-bert-wwm-ext/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def is_equal(a, b):
    """Check whether two results are equal
    """
    a = round(float(a), 6)
    b = round(float(b), 6)
    return a == b

def remove_bucket(equation):
    """Remove redundant parentheses
    """
    l_buckets, buckets = [], []
    for i, c in enumerate(equation):
        if c == '(':
            l_buckets.append(i)
        elif c == ')':
            buckets.append((l_buckets.pop(), i))
    eval_equation = eval(equation)
    for l, r in buckets:
        new_equation = '%s %s %s' % (equation[:l], equation[l + 1:r], equation[r + 1:])
        try:
            if is_equal(eval(new_equation.replace(' ', '')), eval_equation):
                equation = new_equation
        except:
            pass
    return equation.replace(' ', '')

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the training data and normalise it so that the equation can be eval'ed.
        Reference: https://kexue.fm/archives/7809
        """
        D = []
        for l in open(filename, 'r', encoding='utf-8'):
            l = json.loads(l)
            question, equation, answer = l['original_text'], l['equation'], l['ans']
            # handle mixed fractions
            question = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', question)
            equation = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', equation)
            answer = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', answer)
            equation = re.sub('(\d+)\(', '\\1+(', equation)
            answer = re.sub('(\d+)\(', '\\1+(', answer)
            # drop brackets around fractions
            question = re.sub('\((\d+/\d+)\)', '\\1', question)
            # handle percentages
            equation = re.sub('([\.\d]+)%', '(\\1/100)', equation)
            answer = re.sub('([\.\d]+)%', '(\\1/100)', answer)
            # convert colons to division, handle remaining percent signs
            equation = equation.replace(':', '/').replace('%', '/100')
            answer = answer.replace(':', '/').replace('%', '/100')
            if equation[:2] == 'x=':
                equation = equation[2:]
            try:
                if is_equal(eval(equation), eval(answer)):
                    D.append((question, remove_bucket(equation), answer))
            except:
                continue
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    for question, equation, answer in batch:
        token_ids, segment_ids = tokenizer.encode(question, equation, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

# Load the dataset
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/train.ape.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/valid.ape.json')
# valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/ape210k/test.ape.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocab
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, vocab_size]
        targets: y_true, y_segment
        UniLM style: the non-seq2seq part has to be masked out manually
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids mark exactly the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class AutoSolve(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, text, topk=1):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy()).replace(' ', '')

autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64, device=device)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_acc = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        metrics = self.evaluate(valid_dataset.data[:200])  # evaluate the model
        if metrics['acc'] >= self.best_acc:
            self.best_acc = metrics['acc']
            # model.save_weights('./best_model_math.pt')  # save the model
        metrics['best_acc'] = self.best_acc
        print('valid_data:', metrics)
        print()

    def evaluate(self, data, topk=1):
        total, right = 0.0, 0.0
        for question, equation, answer in tqdm(data, desc='Evaluate'):
            total += 1
            pred_equation = autosolve.generate(question, topk)
            try:
                right += int(is_equal(eval(pred_equation), eval(answer)))
            except:
                pass
        return {'acc': right / total}

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=500, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.weights')
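The data normalisation above leans on `is_equal` and `remove_bucket`. A minimal sketch of their behaviour, not part of the commit, assuming the module above has been loaded:

print(remove_bucket('1+(2*3)'))   # '1+2*3'   -- these brackets do not change the value, so they are dropped
print(remove_bucket('(1+2)*3'))   # '(1+2)*3' -- dropping these would change the value, so they stay
print(is_equal(2/3, 0.666667))    # True      -- equality after rounding both sides to 6 decimal places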
examples/seq2seq/task_seq2seq_autotitle.py
0 → 100644

#! -*- coding: utf-8 -*-
# Seq2Seq with BERT, using the UniLM approach
# Write-up: https://kexue.fm/archives/6933

from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
import torch
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import glob

# Basic parameters
maxlen = 256
batch_size = 16
epochs = 10000

# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

def collate_fn(batch):
    """Format of a single sample: [CLS]content[SEP]title[SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for txt in batch:
        text = open(txt, encoding='utf-8').read()
        text = text.split('\n')
        if len(text) > 1:
            title = text[0]
            content = '\n'.join(text[1:])
            token_ids, segment_ids = tokenizer.encode(content, title, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(ListDataset(glob.glob('F:/Projects/data/corpus/sentence_classification/THUCNews/*/*.txt')), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocab
).to(device)
summary(model, input_data=[next(iter(train_dataloader))[0]])

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, vocab_size]
        targets: y_true, y_segment
        UniLM style: the non-seq2seq part has to be masked out manually
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids mark exactly the part to be predicted
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-5))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, text, topk=1, topp=0.95):
        max_c_len = maxlen - self.maxlen
        token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)

def just_show():
    s1 = u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。'
    s2 = u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, steps, epoch, logs=None):
        # save the best
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # demo output
        just_show()

if __name__ == '__main__':
    just_show()
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=100, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
examples/seq2seq/task_seq2seq_autotitle_csl_bart.py
0 → 100644

#! -*- coding: utf-8 -*-
# Seq2Seq with BERT, using the BART approach

from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import json
from rouge import Rouge

# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000

# Model configuration
config_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Load and simplify the vocab, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data. Single-sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D

def collate_fn(batch):
    """Format of a single sample: content: [CLS]article[SEP]  tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append(token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='bart',
    keep_tokens=keep_tokens,
    segment_vocab_size=0,
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1.5e-5))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)

def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '), smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}

if __name__ == '__main__':
    evaluator = Evaluator()
    just_show()
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
examples/seq2seq/task_seq2seq_autotitle_csl_mt5.py
0 → 100644

#! -*- coding: utf-8 -*-
# Fine-tuning the multilingual T5 (mT5) for a Seq2Seq task
# Write-up: https://kexue.fm/archives/7867
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# Adds the evaluation metrics bleu, rouge-1, rouge-2, rouge-l
# Main mT5 traits: gated-gelu, independent weights for the decoder's final dense layer, rmsnorm

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import SpTokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000
token_pad_ids = -100

# Model configuration
config_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_torch_base]/pytorch_model.bin'
# the two configs below are taken from bert4keras, project link: https://github.com/bojone/t5_in_bert4keras
spm_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_bert4keras]/sentencepiece_cn.model'
keep_tokens_path = 'F:/Projects/pretrain_ckpt/t5/[google_mt5_bert4keras]/sentencepiece_cn_keep_tokens.json'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data. Single-sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D

tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>')
keep_tokens = json.load(open(keep_tokens_path))

def collate_fn(batch):
    """Format of a single sample: content: [CLS]article[SEP]  tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)
        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append([0] + token_ids)
    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids, value=token_pad_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids, value=token_pad_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()

train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='mt5.1.1',
    segment_vocab_size=0,
    attention_scale=False,
    is_dropout=True,
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocab
    tie_emb_prj_weight=False,  # independent weights
    token_pad_ids=token_pad_ids,  # can also be done by specifying custom_attention_mask and passing an attention_mask
).to(device)

class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)

model.compile(loss=CrossEntropyLoss(ignore_index=token_pad_ids), optimizer=optim.Adam(model.parameters(), 1e-4))

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        # inputs contains [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])

autotitle = AutoTitle(start_id=0, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '), smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}

def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))

if __name__ == '__main__':
    evaluator = Evaluator()
    just_show()
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')