sunzhq2 / yidong-infer / Commits / 0e29b9b7

Commit 0e29b9b7, authored Jan 20, 2026 by xuxo
Commit message: yidong infer init
Pipeline #3252 failed with stages in 0 seconds
Showing 20 changed files with 2564 additions and 0 deletions (+2564, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_hierarchical_position.py (+109, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_nezha.py (+119, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_roformer.py (+121, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_roformer_v2.py (+115, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_xlnet.py (+121, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/README.md (+37, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/config.py (+27, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step1_0.ipynb (+272, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step1_1.py (+205, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step2_0.ipynb (+158, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step2_1.py (+142, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step3_inference.ipynb (+81, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step3_predict.ipynb (+199, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/utils.py (+119, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_DimensionalityReduction.py (+50, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_model_distillation.py (+64, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_CoSENT.py (+145, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_ContrastiveLoss.py (+139, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_CosineMSELoss.py (+137, -0)
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_InfoNCE.py (+204, -0)
Too many changes to show: to preserve performance, only 150 of 150+ files are displayed.
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_hierarchical_position.py
0 → 100644
#! -*- coding:utf-8 -*-
# Sentiment classification example; very long texts use hierarchical_position
# (hierarchically decomposed position encodings), see spaces.ac.cn/archives/7947
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

maxlen = 1024
batch_size = 3
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Build the dataloaders
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        # hierarchical_position and max_position are specified here to extend the original position embeddings
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, hierarchical_position=True, max_position=1024, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        _, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Define the evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total


class Evaluator(Callback):
    """Evaluate and save the best weights."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
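The hierarchical_position=True option in the script above refers to the hierarchical decomposition of position embeddings described at spaces.ac.cn/archives/7947: positions beyond the pretrained table are expressed as a weighted mix of two pretrained embeddings. A minimal NumPy sketch of that idea follows; the function name and the default alpha=0.4 are assumptions for illustration, not bert4torch's actual implementation:

```python
import numpy as np

def hierarchical_position(pretrained_pos, target_len, alpha=0.4):
    """Expand a pretrained position table [n, d] to [target_len, d].

    Position i is decomposed as i = q * n + r and embedded as
    alpha * u[q] + (1 - alpha) * u[r], where u = (p - alpha * p[0]) / (1 - alpha)
    so that the first n positions reproduce the pretrained embeddings exactly.
    """
    n, d = pretrained_pos.shape
    u = (pretrained_pos - alpha * pretrained_pos[:1]) / (1 - alpha)
    idx = np.arange(target_len)
    q, r = idx // n, idx % n
    return alpha * u[q] + (1 - alpha) * u[r]

# Example: extend a 512-position table to the 1024 positions used above
base = np.random.randn(512, 768).astype(np.float32)
extended = hierarchical_position(base, 1024)
print(extended.shape)  # (1024, 768)
```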
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_nezha.py
0 → 100644
#! -*- coding:utf-8 -*-
# Sentiment classification example, loading NEZHA weights
# valid_acc: 95.07, test_acc: 94.72
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import os

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/nezha/[huawei_noah_torch_base]--nezha-cn-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Build the dataloaders
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        # Specify model='nezha' and the corresponding checkpoint path
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='nezha', with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        hidden_states, pooling = self.bert([token_ids, segment_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Define the evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total


class Evaluator(Callback):
    """Evaluate and save the best weights."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        test_acc = evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_roformer.py
0 → 100644
#! -*- coding:utf-8 -*-
# Sentiment classification example with RoPE relative position encoding
# Official project: https://github.com/ZhuiyiTechnology/roformer
# PyTorch reference project: https://github.com/JunnYu/RoFormer_pytorch
# valid_acc: 94.85, test_acc: 94.42
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import random
import os
import numpy as np

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Build the dataloaders
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        # Specify the model type and the corresponding checkpoint path
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer', with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        hidden_states, pooling = self.bert([token_ids, segment_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Define the evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total


class Evaluator(Callback):
    """Evaluate and save the best weights."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        test_acc = evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_roformer_v2.py
0 → 100644
#! -*- coding:utf-8 -*-
# Sentiment classification example with RoPE relative position encoding
# Official project: https://github.com/ZhuiyiTechnology/roformer-v2
# PyTorch reference project: https://github.com/JunnYu/RoFormer_pytorch
# valid_acc: 95.78, test_acc: 96.09
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Build the dataloaders
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        # Specify the model type and the corresponding checkpoint path
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='roformer_v2')
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        last_hidden_state = self.bert([token_ids, segment_ids])
        output = self.dropout(last_hidden_state[:, 0, :])
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Define the evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total


class Evaluator(Callback):
    """Evaluate and save the best weights."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        test_acc = evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_xlnet.py
0 → 100644
#! -*- coding:utf-8 -*-
# Sentiment classification task: XLNet
# The tokenizer in the transformers package pads on the left side
# Either the transformers tokenizer or SpTokenizer can be used here; note that when taking
# the last token you must take the last non-padding position
# valid_acc: 95.00, test_acc: 94.24
from bert4torch.tokenizers import SpTokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader

maxlen = 256
batch_size = 16
pretrain_model = 'F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base/'
config_path = pretrain_model + 'bert4torch_config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
spm_path = pretrain_model + 'spiece.model'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)

# Build the tokenizer
tokenizer = SpTokenizer(spm_path, token_start=None, token_end='<cls>')

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text, label in batch:
        token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_labels.append([label])

    # Pad with the tokenizer's pad_id
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, value=tokenizer._token_pad_id), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Build the dataloaders
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of the pretrained transformer
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='xlnet', token_pad_ids=tokenizer._token_pad_id, segment_vocab_size=0)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(768, 2)

    def forward(self, token_ids):
        last_hidden_state = self.bert([token_ids])
        # Take the hidden state at the last non-padding (<cls>) position
        last_token_idx = token_ids.not_equal(tokenizer._token_pad_id).sum(dim=1) - 1
        last_token_idx = last_token_idx[:, None, None].expand(last_hidden_state.shape[0], 1, last_hidden_state.shape[-1])
        pooling = torch.gather(last_hidden_state, dim=1, index=last_token_idx).squeeze(1)
        output = self.dropout(pooling)
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Define the evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total


class Evaluator(Callback):
    """Evaluate and save the best weights."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        test_acc = evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/README.md
0 → 100644
# Two-stage recall + ranking model
An FAQ solution for the finance domain.

## Modeling approach
1. Stage one: supervised semantic similarity training with MultiNegativeRankingLoss
2. Using the stage-one model, recall by vector similarity the K standard questions q_std_pred closest to every similar question q_sim; candidates equal to q_std are positives, the rest are hard negatives
3. Stage two: supervised semantic similarity training with ContrastiveLoss
4. Prediction: a query first retrieves the top-K standard questions q_std with the stage-one model, then the stage-two model picks the most likely standard question from that top-K (see the sketch after this README)

## Pros and cons
- Stage-one training automatically constructs hard negatives for the stage-two model, similar in spirit to boosting, which further improves accuracy

## Files
| File | Description |
| ---- | ---- |
| task_sentence_embedding_FinanceFAQ_step1_0.ipynb | Stage-one data generation |
| task_sentence_embedding_FinanceFAQ_step1_1.py | Stage-one model training |
| task_sentence_embedding_FinanceFAQ_step2_0.ipynb | Stage-two data generation |
| task_sentence_embedding_FinanceFAQ_step2_1.py | Stage-two model training |
| task_sentence_embedding_FinanceFAQ_step3_predict.ipynb | Model evaluation |
| task_sentence_embedding_FinanceFAQ_step3_inference.ipynb | Single-sample inference |

## Metrics
- Evaluation set: all (standard question, similar question) pairs (in-sample)
- Metric: recall (share of cases where the correct standard question appears in the recalled top-K)

| Stage | Top1 | Top3 | Top5 | Top10 |
|----|----|----|----|----|
| Stage one, raw mode | 91.32 | 97.94 | 98.91 | 99.57 |
| Stage one, random mode | 88.19 | 95.93 | 97.56 | 98.82 |
| Stage one, mul_ce mode | 90.32 | 97.51 | 98.67 | 99.44 |
| Stage two | 98.00 | 99.47 | 99.79 | 100 |
| Stage one (raw) + stage two end to end | 97.54 | 99.00 | 99.33 | 99.50 |

## requirements
transformers==4.15.0
\ No newline at end of file
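As a reference for the prediction flow described in the README (stage-one recall of top-K standard questions, stage-two re-ranking), here is a minimal sketch. It assumes `model1`/`model2` are the trained stage-one/stage-two models exposing the `encode(texts)` method defined in step1_1/step2_1, and that `q_std_emb1`/`q_std_emb2` are the precomputed embedding matrices saved to the `*_q_std_vectors_file.npy` files; the helper names are illustrative only:

```python
import numpy as np

def normalize(x):
    return x / np.linalg.norm(x, axis=-1, keepdims=True)

def two_stage_predict(query, model1, model2, q_std_list, q_std_emb1, q_std_emb2, k=10):
    """Stage one recalls the top-k standard questions by cosine similarity;
    stage two re-scores only those k candidates and returns them re-ranked."""
    # Stage one: recall top-k candidates with the stage-one encoder
    q1 = normalize(model1.encode([query]).numpy())          # [1, d]
    sims1 = (q1 @ normalize(q_std_emb1).T).reshape(-1)       # cosine similarity vs every q_std
    topk = np.argsort(-sims1)[:k]

    # Stage two: re-rank the k recalled candidates with the stage-two encoder
    q2 = normalize(model2.encode([query]).numpy())
    sims2 = (q2 @ normalize(q_std_emb2[topk]).T).reshape(-1)
    order = np.argsort(-sims2)
    return [(q_std_list[topk[i]], float(sims2[i])) for i in order]
```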
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/config.py
0 → 100644
# Model checkpoint paths
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'

data_dir = 'F:/Projects/data/corpus/qa/FinanceFAQ'
q_std_file = f'{data_dir}/q_std_file.tsv'  # standard questions
q_corpus_file = f'{data_dir}/q_corpus_file.tsv'  # full corpus
q_sim_file = f'{data_dir}/q_sim_file.tsv'

# Stage one training
fst_train_file = f'{data_dir}/fst_train.tsv'
fst_dev_file = f'{data_dir}/fst_dev.tsv'
ir_path = f'{data_dir}/fst_ir_corpus.tsv'
fst_q_std_vectors_file = f'{data_dir}/fst_q_std_vectors_file.npy'
fst_q_corpus_vectors_file = f'{data_dir}/fst_q_corpus_vectors_file.npy'
fst_std_data_results = f'{data_dir}/fst_std_data_results.tsv'
fst_eval_path_list = [f'{data_dir}/fst_eval.tsv']

# Stage two
sec_train_file = f'{data_dir}/sec_train_file.tsv'
sec_dev_file = f'{data_dir}/sec_dev_file.tsv'
sec_test_file = f'{data_dir}/sec_test_file.tsv'
sec_q_std_vectors_file = f'{data_dir}/sec_q_std_vectors_file.npy'
sec_q_corpus_vectors_file = f'{data_dir}/sec_q_corpus_vectors_file.npy'
sec_eval_path_list = []
\ No newline at end of file
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step1_0.ipynb
0 → 100644
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"source": [
"from config import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"q_std_map = pd.read_csv('F:/Projects/data/corpus/qa/FinanceFAQ/input/q_std.tsv', sep='\\t', encoding='utf-8')['0'].to_dict()\n",
"query_pair = pd.read_csv('F:/Projects/data/corpus/qa/FinanceFAQ/input/query_pair_0.tsv', sep='\\t', encoding='utf-8')\n",
"query_pair['q_std'] = query_pair['q_std'].map(q_std_map)\n",
"query_pair.to_csv(fst_train_file, sep='\\t', encoding='utf-8', index=False)\n",
"query_pair.iloc[5:9]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>q_std</th>\n",
" <th>q_sim</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>现金利能否直接购买股票</td>\n",
" <td>就是说现金利是可以卖股票的对吗</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>损益表的介绍</td>\n",
" <td>损益表是啥</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>#股票名称#季度报告</td>\n",
" <td>详细说下600338第一季报吧</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>未成交的介绍</td>\n",
" <td>需要知道未成交是什么</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" q_std q_sim\n",
"5 现金利能否直接购买股票 就是说现金利是可以卖股票的对吗\n",
"6 损益表的介绍 损益表是啥\n",
"7 #股票名称#季度报告 详细说下600338第一季报吧\n",
"8 未成交的介绍 需要知道未成交是什么"
]
},
"metadata": {},
"execution_count": 9
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"source": [
"query_pair.q_sim.str.len().describe()"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"count 50000.000000\n",
"mean 18.549940\n",
"std 7.961594\n",
"min 1.000000\n",
"25% 13.000000\n",
"50% 18.000000\n",
"75% 23.000000\n",
"max 108.000000\n",
"Name: q_sim, dtype: float64"
]
},
"metadata": {},
"execution_count": 10
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 11,
"source": [
"# 为每个q_std随机选择一个q_sim作为dev集\n",
"q_std_list = query_pair['q_std'].unique().tolist()\n",
"query_pair['test_rnd'] = query_pair.q_std.apply(lambda x: np.random.rand())\n",
"query_pair['nrank_test'] = query_pair.groupby('q_std')['test_rnd'].rank(ascending=0, method='first')\n",
"dev_query_pair = query_pair[query_pair.nrank_test<=1][['q_std', 'q_sim']]\n",
"dev_query_pair.head(5)"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>q_std</th>\n",
" <th>q_sim</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>回档的介绍</td>\n",
" <td>回档是什么东西</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>合约未了结情况下,卖出担保品时,提示零股不支持交易的解决方式</td>\n",
" <td>合约未了结情况下,卖出担保品时,解决提醒零股不能买卖的方式可能是什么</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>任职基金数大于#数字实体#的基金经理有哪些</td>\n",
" <td>想知道任职基金数大于50的基金经理有哪些</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>市销率大于#数字实体#的行业有哪些</td>\n",
" <td>我来咨询看看市销率大于100行业都有啥</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>竞价涨幅不小于#数字实体#的#地域板块#股票有哪些</td>\n",
" <td>给我发下都有啥内蒙板块股票是竞价涨幅不少于50</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" q_std q_sim\n",
"1 回档的介绍 回档是什么东西\n",
"11 合约未了结情况下,卖出担保品时,提示零股不支持交易的解决方式 合约未了结情况下,卖出担保品时,解决提醒零股不能买卖的方式可能是什么\n",
"15 任职基金数大于#数字实体#的基金经理有哪些 想知道任职基金数大于50的基金经理有哪些\n",
"16 市销率大于#数字实体#的行业有哪些 我来咨询看看市销率大于100行业都有啥\n",
"31 竞价涨幅不小于#数字实体#的#地域板块#股票有哪些 给我发下都有啥内蒙板块股票是竞价涨幅不少于50"
]
},
"metadata": {},
"execution_count": 11
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 12,
"source": [
"# 为所有的query配一个qid\n",
"q_std_dev = dev_query_pair.q_std.unique().tolist()\n",
"q_sim_dev = dev_query_pair.q_sim.unique().tolist()\n",
"q_qid = q_std_dev + q_sim_dev\n",
"q_qid = list(set(q_qid))\n",
"q_qid_dict = {i+1:q_qid[i] for i in range(0, len(q_qid))} # {id: query}\n",
"q_qid_dict_inv = {v: k for k, v in q_qid_dict.items()} # {query: id}\n",
"\n",
"# 建立ir_corpus: [q_id, q_std]的映射\n",
"ir_corpus = {q_qid_dict_inv[v]: v for v in q_std_list if v not in q_sim_dev}\n",
"ir_corpus_df = pd.DataFrame(list(ir_corpus.items()), columns=['qid', 'question']).sort_values('qid').reset_index(drop=True)\n",
"ir_corpus_df.to_csv(ir_path, sep='\\t', index=False)\n",
"\n",
"# 保存dev\n",
"dev_query_pair['qid'] = dev_query_pair.q_sim.map(q_qid_dict_inv)\n",
"dev_query_pair['duplicate_qids'] = dev_query_pair.q_std.map(q_qid_dict_inv)\n",
"dev_query_pair.duplicate_qids = dev_query_pair.duplicate_qids.astype('str')\n",
"dev_query_pair = dev_query_pair.groupby(['q_sim', 'qid']).apply(lambda v: ','.join(v['duplicate_qids'])).reset_index(name='duplicate_qids')[['qid', 'q_sim', 'duplicate_qids']]\n",
"dev_query_pair.to_csv(fst_dev_file, sep='\\t', index=False)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print('读取数据集并分别保存标问、相似问、所有语料: ', fst_train_file)\n",
"std_data = pd.read_csv(fst_train_file, sep=\"\\t\")\n",
"q_std_list = std_data.q_std.unique().tolist() # 标准问list\n",
"q_sim_list = std_data.q_sim.unique().tolist() # 相似问list\n",
"q_corpus = list(set(q_std_list + q_sim_list))\n",
"\n",
"q_std_df = pd.DataFrame(q_std_list, columns=['q'])\n",
"q_corpus_df = pd.DataFrame(q_corpus, columns=['q'])\n",
"q_sim_df = pd.DataFrame(q_sim_list, columns=['q'])\n",
"\n",
"q_std_df.to_csv(q_std_file, index=None, header=False, sep=\"\\t\")\n",
"q_corpus_df.to_csv(q_corpus_file, index=None, header=False, sep=\"\\t\")\n",
"q_sim_df.to_csv(q_sim_file, index=None, header=False, sep=\"\\t\")\n",
"\n",
"print('q_std_list:——>', len(q_std_list), 'q_sim_list:——>', len(q_sim_list), 'q_corpus:——>', len(q_corpus))"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step1_1.py
0 → 100644
#! -*- coding:utf-8 -*-
# loss: MultiNegativeRankingLoss; like SimCSE, the other samples in the batch serve as negatives
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sentence_transformers import evaluation
from config import config_path, checkpoint_path, dict_path, fst_train_file, fst_dev_file, ir_path
import numpy as np
import pandas as pd
import random
import os

# Fix the random seed
seed_everything(42)

maxlen = 64
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# raw: the original version
# random: sample within each standard-question group; different groups serve as negatives for each other
# mul_ce: modified original version; cross-group positives also exist (when the standard questions match)
choice = 'mul_ce'
print(f'using {choice} mode in step1 model'.center(60, '-'))

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

if choice in {'raw', 'mul_ce'}:
    # Original mode: the same standard question may appear more than once within a batch
    def collate_fn(batch):
        if choice == 'raw':
            labels = torch.arange(len(batch), device=device)
        else:
            labels = torch.eye(len(batch), dtype=torch.long, device=device)
            # Mark samples with identical standard questions as positives
            for i, (q_std1, _) in enumerate(batch):
                for j, (q_std2, _) in enumerate(batch[i+1:], start=i+1):
                    if q_std1 == q_std2:
                        labels[i, j] = 1
                        labels[j, i] = 1

        texts_list = [[] for _ in range(2)]
        for texts in batch:
            for i, text in enumerate(texts):
                token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
                texts_list[i].append(token_ids)
        for i, texts in enumerate(texts_list):
            texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
        return texts_list, labels

    class MyDataset(ListDataset):
        @staticmethod
        def load_data(filename):
            D = []
            with open(filename, encoding='utf-8') as f:
                for row, l in enumerate(f):
                    if row == 0:  # skip the header row
                        continue
                    q_std, q_sim = l.strip().split('\t')
                    D.append((q_std.replace(' ', ''), q_sim.replace(' ', '')))
            return D

elif choice == 'random':
    # Key-value pairs keyed by standard question, guaranteeing no two samples in a batch share the same q_std
    def collate_fn(batch):
        texts_list = [[] for _ in range(2)]
        for text_list in batch:
            # q_std itself is sampled with probability 0.5
            p = [0.5] + [0.5 / (len(text_list) - 1)] * (len(text_list) - 1)
            texts = np.random.choice(text_list, 2, replace=False, p=p)
            for i, text in enumerate(texts):
                token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
                texts_list[i].append(token_ids)
        for i, texts in enumerate(texts_list):
            texts_list[i] = torch.tensor(sequence_padding(texts), dtype=torch.long, device=device)
        labels = torch.arange(texts_list[0].size(0), device=texts_list[0].device)
        return texts_list, labels

    class MyDataset(ListDataset):
        @staticmethod
        def load_data(filename):
            D = dict()
            with open(filename, encoding='utf-8') as f:
                for row, l in enumerate(f):
                    if row == 0:  # skip the header row
                        continue
                    q_std, q_sim = l.strip().split('\t')
                    q_std = q_std.replace(' ', '')
                    q_sim = q_sim.replace(' ', '')
                    D[q_std] = D.get(q_std, []) + [q_sim]
            return [[k] + v for k, v in D.items()]

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls', scale=20.0):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.pool_method = pool_method
        self.scale = scale

    def forward(self, token_ids_list):
        reps = []
        for token_ids in token_ids_list:
            hidden_state1, pool_cls1 = self.bert([token_ids])
            rep = get_pool_emb(hidden_state1, pool_cls1, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)
        embeddings_a = reps[0]
        embeddings_b = torch.cat(reps[1:])
        scores = self.cos_sim(embeddings_a, embeddings_b) * self.scale  # [btz, btz*2]
        return scores

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.bert([token_ids])
            output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return output

    def encode(self, texts, **kwargs):
        token_ids_list = []
        for text in texts:
            token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
            token_ids_list.append(token_ids)
        token_ids_tensor = torch.tensor(sequence_padding(token_ids_list), dtype=torch.long)
        valid_dataloader = DataLoader(TensorDataset(token_ids_tensor), batch_size=batch_size)

        valid_sen_emb = []
        self.eval()
        for token_ids in tqdm(valid_dataloader, desc='Evaluate'):
            token_ids = token_ids[0].to(device)
            output = self.predict(token_ids)
            valid_sen_emb.append(output.cpu())
        valid_sen_emb = torch.cat(valid_sen_emb, dim=0)
        return valid_sen_emb

    @staticmethod
    def cos_sim(a, b):
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))

model = Model().to(device)

# Multi-class loss
class Myloss(nn.Module):
    def forward(self, y_pred, y_true):
        y_pred = torch.log(torch.softmax(y_pred, dim=-1)) * y_true  # [btz, btz]
        return -y_pred.sum() / len(y_pred)
        # y_pred_pos = (y_pred * y_true).sum(dim=-1)
        # y_pred_sum = torch.logsumexp(y_pred, dim=-1)
        # return (y_pred_sum - y_pred_pos).sum() / len(y_pred)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=Myloss() if choice == 'mul_ce' else nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    def __init__(self):
        super().__init__()
        self.best_perf = 0

    def on_dataloader_end(self, logs=None):
        model.train_dataloader = DataLoader(MyDataset(fst_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    def on_epoch_end(self, global_step, epoch, logs=None):
        perf = evaluate(model, epoch=model.epoch, steps=model.global_step, output_path='./')
        if perf > self.best_perf:
            self.best_perf = perf
            model.save_weights(f'./fst_best_weights_{choice}.pt')
        print(f'perf: {perf:.2f}, best perf: {self.best_perf:.2f}\n')

if __name__ == '__main__':
    # Training set
    train_dataloader = DataLoader(MyDataset(fst_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # Validation set
    ir_queries, ir_corpus, ir_relevant_docs = {}, {}, {}
    with open(fst_dev_file, 'r', encoding='utf-8') as f:
        next(f)
        for line in f:
            qid, query, duplicate_ids = line.strip().split('\t')
            duplicate_ids = duplicate_ids.split(',')
            ir_queries[qid] = query
            ir_relevant_docs[qid] = set(duplicate_ids)
    ir_corpus_df = pd.read_csv(ir_path, sep='\t')
    ir_corpus_df.qid = ir_corpus_df.qid.astype('str')
    ir_corpus = dict(zip(ir_corpus_df.qid.tolist(), ir_corpus_df.question.tolist()))
    evaluate = evaluation.InformationRetrievalEvaluator(ir_queries, ir_corpus, ir_relevant_docs, name=choice)

    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=100, callbacks=[evaluator])
else:
    model.load_weights(f'./fst_best_weights_{choice}.pt')
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step2_0.ipynb
0 → 100644
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 准备二阶段训练数据集\n",
"### 1. 用一阶段模型把所有query转成向量"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from config import *\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model\n",
"\n",
"# 读取标问和所有语料\n",
"q_std_list = pd.read_csv(q_std_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"q_corpus = pd.read_csv(q_corpus_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"\n",
"# get embeddings\n",
"q_std_sentence_embeddings = model.encode(q_std_list)\n",
"np.save(fst_q_std_vectors_file, q_std_sentence_embeddings.numpy())\n",
"q_corpus_sentence_embeddings = model.encode(q_corpus)\n",
"np.save(fst_q_corpus_vectors_file, q_corpus_sentence_embeddings.numpy())\n",
"print('标准问向量路径:', fst_q_std_vectors_file)\n",
"print('所有语料保存向量路径:', fst_q_corpus_vectors_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. 为每个q_sim找到topK的的q_std"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model\n",
"from config import *\n",
"from utils import *\n",
"\n",
"# 读取q_std、q_corpus语料和向量\n",
"q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, fst_q_std_vectors_file, q_corpus_file, fst_q_corpus_vectors_file)\n",
"\n",
"print('----加载一阶段训练(标问-相似问)数据集', fst_train_file)\n",
"df_eval = pd.read_csv(fst_train_file, sep=\"\\t\")\n",
"print(\"shape: \", df_eval.shape)\n",
"df_eval = df_eval[df_eval.q_std.isin(q_std_list)]\n",
"print(\"shape: \", df_eval.shape)\n",
"\n",
"df_eval = cal_performance(model, q_all_sentence_embeddings_dict, q_std_sentence_embeddings, q_std_list, df_eval, K=20)\n",
"df_eval.to_csv(fst_std_data_results, index=None, sep=\"\\t\")\n",
"df_eval.iloc[3:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. 二阶段正负样本生成\n",
"预测的topK中和q_std一致的为正样本,不一致的为困难负样本"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"xdf = df_eval.copy(deep=True)\n",
"# xdf['q_std_pred_list']=xdf.q_std_pred_list.apply(lambda v:eval(v))\n",
"print('预测结果中和q_std不一致的'.center(60, '-'))\n",
"xdf['q_std_pred_list_else'] = xdf.apply(lambda row: [v for v in row['q_std_pred_list'] if v[0] != row['q_std']], axis=1)\n",
"xdf['q_std_pred_list_else_v1'] = xdf.q_std_pred_list_else.apply(lambda v: [m[0] for m in v]) # 负样本的文本\n",
"xdf['q_std_pred_list_else_v2'] = xdf.q_std_pred_list_else.apply(lambda v: [m[1] for m in v]) # 负样本的概率\n",
"\n",
"print('组织正负样本'.center(60, '-'))\n",
"xdf['pairs'] = xdf.apply(lambda row: ['1' + '\\t' + row['q_sim'] + '\\t' + row['q_std'] + '\\t' + '1'] + [\n",
" '0' + '\\t' + row['q_sim'] + '\\t' + v[0] + '\\t' + str(v[1]) for v in row['q_std_pred_list_else'][0:10]], axis=1)\n",
"print(xdf.iloc[3]['pairs'])\n",
"\n",
"print('单独处理正负样本'.center(60, '-'))\n",
"q_sim_list = xdf.q_sim.unique().tolist()\n",
"q_std_list = xdf.q_std.unique().tolist()\n",
"q_sim_dict = {q_sim_list[i]: i for i in range(0, len(q_sim_list))}\n",
"q_std_dict = {q_std_list[i]: i for i in range(0, len(q_std_list))}\n",
"pairs = xdf.pairs.tolist()\n",
"pairs_list = [v.split('\\t') for vlist in pairs for v in vlist]\n",
"pairs_df = pd.DataFrame(pairs_list, columns=['label', 'q_sim', 'q_std', 'prob'])\n",
"print(pairs_df.drop_duplicates(['q_std', 'q_sim']).shape)\n",
"pairs_df.head()\n",
"\n",
"pairs_df_2 = pairs_df.sort_values('label', ascending=0).drop_duplicates(['q_sim', 'q_std'])\n",
"pairs_df_final = pairs_df_2\n",
"print(pairs_df_final.shape, pairs_df.shape)\n",
"\n",
"print('对于每一个q_sim,仅保留概率最高的10条样本'.center(60, '-'))\n",
"pairs_df_final['prob'] = pairs_df_final.prob.astype(\"float\")\n",
"pairs_df_final['nrank'] = pairs_df_final.groupby(['label', 'q_sim'])['prob'].rank(ascending=0, method='first')\n",
"df_final = pairs_df_final[pairs_df_final.nrank <= 9].reset_index(drop=True)\n",
"df_final['sim_idx'] = df_final.q_sim.map(q_sim_dict)\n",
"df_final['std_idx'] = df_final.q_std.map(q_std_dict)\n",
"df_final = df_final.sort_values(['sim_idx', 'label', 'nrank'], ascending=[1, 0, 1])[['label', 'q_sim', 'q_std']].reset_index(drop=True)\n",
"\n",
"print('对于每一条标问,随机挑选一条样本作为dev集合'.center(60, '-'))\n",
"xdf['dev_rnd'] = xdf.q_std.apply(lambda v: np.random.rand())\n",
"xdf['nrank_dev'] = xdf.groupby('q_std')['dev_rnd'].rank(ascending=0, method='first')\n",
"q_sim_choose_dev = xdf[xdf.nrank_dev <= 1].drop_duplicates(['q_sim']).q_sim.tolist()\n",
"df_train = df_final.copy(deep=True)\n",
"df_dev = df_final[df_final.q_sim.isin(q_sim_choose_dev)]\n",
"print('第二阶段train集: ', sec_train_file, ', shape: ', df_train.shape)\n",
"df_train[['label', 'q_std', 'q_sim']].to_csv(sec_train_file, sep=\"\\t\", index=None, header=False)\n",
"print('第二阶段dev集: ', sec_dev_file, ', shape', df_dev.shape)\n",
"df_dev[['label', 'q_std', 'q_sim']].to_csv(sec_test_file, sep=\"\\t\", index=None, header=False)\n",
"df_dev[['label', 'q_std', 'q_sim']].to_csv(sec_dev_file, sep=\"\\t\", index=None, header=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step2_1.py
0 → 100644
#! -*- coding:utf-8 -*-
# Stage-two training: further re-ranking based on hard negatives
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import ContrastiveLoss
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from config import config_path, checkpoint_path, dict_path, sec_train_file, sec_dev_file
import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.metrics import roc_auc_score
import random
import os

# Fix the random seed
seed_everything(42)

maxlen = 64
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    tokens_ids_list = [[] for _ in range(2)]
    labels = []
    for text1, text2, label in batch:
        tokens_ids_list[0].append(tokenizer.encode(text1, maxlen=maxlen)[0])
        tokens_ids_list[1].append(tokenizer.encode(text2, maxlen=maxlen)[0])
        labels.append(label)
    for i, token_ids in enumerate(tokens_ids_list):
        tokens_ids_list[i] = torch.tensor(sequence_padding(token_ids), dtype=torch.long, device=device)
    labels = torch.tensor(labels, dtype=torch.long, device=device)
    return tokens_ids_list, labels

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                label, text1, text2 = l.strip().split('\t')
                D.append((text1.replace(' ', ''), text2.replace(' ', ''), int(label)))
        return D

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.pool_method = pool_method

    def forward(self, token_ids_list):
        reps = []
        for token_ids in token_ids_list:
            hidden_state1, pool_cls1 = self.bert([token_ids])
            rep = get_pool_emb(hidden_state1, pool_cls1, token_ids.gt(0).long(), self.pool_method)
            reps.append(rep)
        embeddings_a = reps[0]
        embeddings_b = torch.cat(reps[1:])
        scores = 1 - torch.cosine_similarity(embeddings_a, embeddings_b)
        return scores

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.bert([token_ids])
            output = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return output

    def encode(self, texts):
        token_ids_list = []
        for text in texts:
            token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
            token_ids_list.append(token_ids)
        token_ids_tensor = torch.tensor(sequence_padding(token_ids_list), dtype=torch.long)
        valid_dataloader = DataLoader(TensorDataset(token_ids_tensor), batch_size=batch_size)

        valid_sen_emb = []
        for token_ids in tqdm(valid_dataloader, desc='Evaluate'):
            token_ids = token_ids[0].to(device)
            output = self.predict(token_ids)
            valid_sen_emb.append(output.cpu())
        valid_sen_emb = torch.cat(valid_sen_emb, dim=0)
        return valid_sen_emb

model = Model().to(device)

# Define the loss and optimizer; custom ones are supported here
model.compile(
    loss=ContrastiveLoss(margin=0.8),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    def __init__(self):
        super().__init__()
        self.best_val_auc = 0

    def on_dataloader_end(self, logs=None):
        model.train_dataloader = DataLoader(MyDataset(sec_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_auc = self.evaluate(valid_dataloader)
        if val_auc >= self.best_val_auc:
            self.best_val_auc = val_auc
            model.save_weights('sec_best_weights.pt')
        print(f'val_auc: {val_auc:.5f}, best_val_auc: {self.best_val_auc:.5f}\n')

    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data):
            embeddings1.append(model.predict(batch_token1_ids).cpu())
            embeddings2.append(model.predict(batch_token2_ids).cpu())
            labels.append(batch_labels.cpu())
        embeddings1 = torch.cat(embeddings1).numpy()
        embeddings2 = torch.cat(embeddings2).numpy()
        labels = torch.cat(labels).numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        auc = roc_auc_score(labels, cosine_scores)
        return auc

if __name__ == '__main__':
    train_dataloader = DataLoader(MyDataset(sec_train_file), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_dataloader = DataLoader(MyDataset(sec_dev_file), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=1000, callbacks=[evaluator])
else:
    model.load_weights('sec_best_weights.pt')
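The script above trains with bert4torch's ContrastiveLoss(margin=0.8) applied to the distance d = 1 - cos(u, v) returned by Model.forward. Assuming it follows the classic contrastive loss of Hadsell et al. (as the sentence-transformers version does), the objective is sketched below; this is a reference formula, not the library's exact code:

```python
import torch

def contrastive_loss(distances, labels, margin=0.8):
    """distances: [B] pairwise distances (here 1 - cosine similarity),
    labels: [B] with 1 for matching pairs and 0 for non-matching pairs.
    Matching pairs are pulled together (d^2); non-matching pairs are pushed
    apart until they are at least `margin` away (clamped hinge, squared)."""
    positive_term = labels * distances.pow(2)
    negative_term = (1 - labels) * torch.clamp(margin - distances, min=0).pow(2)
    return 0.5 * (positive_term + negative_term).mean()

# Example: a close positive pair, a far negative pair, a close positive pair
d = torch.tensor([0.1, 0.9, 0.3])
y = torch.tensor([1.0, 0.0, 1.0])
print(contrastive_loss(d, y))
```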
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step3_inference.ipynb
0 → 100644
{
"cells": [
{
"cell_type": "markdown",
"source": [
"### 获得单例文本预测结果"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"from config import *\n",
"from utils import *\n",
"\n",
"q_std_list, fst_q_std_sentence_embeddings, q_all, fst_q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, fst_q_std_vectors_file, q_corpus_file, fst_q_corpus_vectors_file)\n",
"_, sec_q_std_sentence_embeddings, _, sec_q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, sec_q_std_vectors_file, q_corpus_file, sec_q_corpus_vectors_file)\n",
"\n",
"def get_fst_topK(text, K=10):\n",
" text_embedding = model1.encode([text])[0].numpy()\n",
" sims_with_std = np.array(cos_sim4matrix_2(text_embedding, fst_q_std_sentence_embeddings))\n",
" sort_idx = np.argsort(-sims_with_std)[:K]\n",
" sims_q_sort = [q_std_list[idx] for idx in sort_idx]\n",
" sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
" result = list(zip(sims_q_sort, sims_values))\n",
" return (result)\n",
"\n",
"def get_sec_topK(self, text, K=20):\n",
" text_embedding = self.model.encode([text])[0]\n",
" sims_with_std = np.array(self.cos_sim4matrix_2(text_embedding, self.std_sentence_embeddings))\n",
" sort_idx = np.argsort(-sims_with_std)[:K]\n",
" sims_q_sort = [self.q_std_list[idx] for idx in sort_idx]\n",
" sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
" result = list(zip(sims_q_sort, sims_values))\n",
" return (result)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import numpy as np\n",
"from config import *\n",
"from utils import *\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model as model1\n",
"from task_sentence_embedding_FinanceFAQ_step2_1 import model as model2\n",
"\n",
"text = input()\n",
"\n",
"# 第一阶段召回\n",
"result_first = get_fst_topK(text=text)\n",
"print('第一阶段\\n', result_first[0:20])\n",
"first_intents = [v[0] for v in result_first]\n",
"\n",
"# 第二阶段召回\n",
"a_texts_embeddings_2 = np.array(model2.encode([text]))\n",
"b_texts_embeddings_2 = np.array([sec_q_all_sentence_embeddings_dict[v] for v in first_intents])\n",
"sims_with_std = cos_sim4matrix_2(a_texts_embeddings_2, b_texts_embeddings_2).reshape(-1)\n",
"sort_idx = np.argsort(-sims_with_std).tolist()\n",
"intents_sort = [first_intents[idx] for idx in sort_idx]\n",
"sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
"result_second = list(zip(intents_sort, sims_values))\n",
"print('第二阶段\\n', result_second[0:20])"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"orig_nbformat": 4,
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/task_sentence_embedding_FinanceFAQ_step3_predict.ipynb
0 → 100644
View file @
0e29b9b7
{
"cells": [
{
"cell_type": "markdown",
"source": [
"## 计算向量,统计结果等\n",
"### 获得基于第二阶段模型所得标问向量"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"from task_sentence_embedding_FinanceFAQ_step2_1 import model\n",
"from config import *\n",
"\n",
"# get list\n",
"q_std_list = pd.read_csv(q_std_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"q_corpus = pd.read_csv(q_corpus_file, sep=\"\\t\", names=['c']).c.tolist()\n",
"\n",
"# get embeddings\n",
"q_std_sentence_embeddings = model.encode(q_std_list)\n",
"print('保存二阶段标准问向量:', sec_q_std_vectors_file)\n",
"np.save(sec_q_std_vectors_file, q_std_sentence_embeddings)\n",
"q_corpus_sentence_embeddings = model.encode(q_corpus)\n",
"print('保存二阶段所有语料向量:', sec_q_corpus_vectors_file)\n",
"np.save(sec_q_corpus_vectors_file, q_corpus_sentence_embeddings)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"### 获得所有待测数据第一阶段模型预测结果"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from config import *\n",
"from utils import *\n",
"from task_sentence_embedding_FinanceFAQ_step1_1 import model as model1\n",
"\n",
"path_list = fst_eval_path_list\n",
"\n",
"# 读取q_std、q_corpus语料和向量\n",
"q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, fst_q_std_vectors_file, q_corpus_file, fst_q_corpus_vectors_file)\n",
"\n",
"for i, input_path in enumerate(path_list):\n",
" print(f'开始评估新语料: {i}'.center(120, '='))\n",
" df_eval = pd.read_csv(input_path, sep=\"\\t\")\n",
" df_eval = df_eval[~pd.isna(df_eval.q_sim)]\n",
" output_path = input_path[:-4] + '_result.tsv'\n",
" print('input_path: ', input_path, 'output_path: ', output_path)\n",
"\n",
" print(\"目标语料数量:\", df_eval.shape, '标问数量:', df_eval.q_std.nunique(), '相似问数量:',\n",
" df_eval.q_sim.nunique(), '标语料去重后数量', df_eval.drop_duplicates([\"q_std\", \"q_sim\"]).shape[0])\n",
"\n",
" ## v1 对于都是有一个是小量的情况下\n",
" df_eval = cal_performance(model1, q_all_sentence_embeddings_dict, q_std_sentence_embeddings, q_std_list, df_eval, K=10)\n",
" df_eval.to_csv(output_path, index=None, sep=\"\\t\")\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"### 获得所有待测数据第二阶段模型预测结果"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import os\n",
"import torch\n",
"from task_sentence_embedding_FinanceFAQ_step2_1 import model as model2\n",
"import numpy as np\n",
"import pandas as pd\n",
"from config import *\n",
"\n",
"path_list = sec_eval_path_list\n",
"\n",
"# 读取q_std、q_corpus语料和向量\n",
"q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, sec_q_std_vectors_file, q_corpus_file, sec_q_corpus_vectors_file)\n",
"# 标问和向量的映射\n",
"corpus_sentence_embeddings_dict = {q_std_list[i]: q_std_sentence_embeddings[i] for i in range(0, len(q_std_list))}\n",
"\n",
"for i, input_path in enumerate(path_list):\n",
" print(f'开始评估新语料: {i}'.center(120, '='))\n",
" df_eval = pd.read_csv(input_path, sep=\"\\t\")\n",
" output_path = input_path[:-4] + '_result.tsv'\n",
" print('input_path: ', input_path, 'output_path: ', output_path)\n",
"\n",
" texts = df_eval.q_sim.tolist()\n",
" texts_in = [v for v in texts if v in q_all_sentence_embeddings_dict.keys()]\n",
" texts_out = [v for v in texts if v not in q_all_sentence_embeddings_dict.keys()]\n",
" texts_out_embeddings = model2.encode(texts_out) if texts_out else []\n",
" texts_embeddings_dict_1 = {texts_in[i]: q_all_sentence_embeddings_dict[texts_in[i]] for i in range(0, len(texts_in))}\n",
" texts_embeddings_dict_2 = {texts_out[i]: texts_out_embeddings[i] for i in range(0, len(texts_out))}\n",
" texts_embeddings_dict = {**texts_embeddings_dict_1, **texts_embeddings_dict_2}\n",
" print('目标语料编码数量:——>', len(texts_embeddings_dict))\n",
"\n",
" def get_sec_result(text, std_texts):\n",
" '''预测模型2的结果\n",
" '''\n",
" a_text_embeddings = texts_embeddings_dict[text] # 获取改相似问在模型2中的向量\n",
" b_text_embeddings = np.array([corpus_sentence_embeddings_dict[v] for v in std_texts]) # 拿到模型1召回的候选标问在模型2中的向量\n",
" sims_with_std = cos_sim4matrix_2(a_text_embeddings, b_text_embeddings).reshape(-1)\n",
" sort_idx = np.argsort(-sims_with_std).tolist()\n",
" intents_sort = [std_texts[idx] for idx in sort_idx]\n",
" sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
" result = list(zip(intents_sort, sims_values))\n",
" return (result)\n",
"\n",
" # 模型1预测结果\n",
" df_eval['q_std_pred_list_v1'] = df_eval.q_std_pred_list_v1.apply(lambda v: eval(v))\n",
"\n",
" # 模型2预测结果\n",
" df_eval['q_std_pred_list_2'] = df_eval.apply(lambda row: get_sec_result(row['q_sim'], row['q_std_pred_list_v1']), axis=1)\n",
"\n",
" df_eval['q_std_pred_list_2_v1'] = df_eval.q_std_pred_list_2.apply(lambda v: [k[0] for k in v])\n",
" df_eval['q_std_pred_list_2_v2'] = df_eval.q_std_pred_list_2.apply(lambda v: [k[1] for k in v])\n",
" df_eval['q_std_pred_2'] = df_eval.q_std_pred_list_2_v1.apply(lambda v: v[0])\n",
" df_eval['prob_2'] = df_eval.q_std_pred_list_2_v2.apply(lambda v: v[0])\n",
"\n",
" df_eval['r1'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:1] else 0, axis=1)\n",
" df_eval['r3'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:3] else 0, axis=1)\n",
" df_eval['r5'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:5] else 0, axis=1)\n",
" df_eval['r10'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:10] else 0, axis=1)\n",
"\n",
" # 扣除不包含的标准问\n",
" print('目标语料准确率:——>')\n",
" print(df_eval.shape)\n",
" df_1 = df_eval\n",
" print('第一阶段整体准确率', df_1.t1.sum() / df_1.shape[0], df_1.t3.sum() / df_1.shape[0], df_1.t5.sum() / df_1.shape[0], df_1.t10.sum() / df_1.shape[0])\n",
" df_2 = df_eval[df_eval.t10 == 1]\n",
" print('第二阶段整体准确率', df_2.r1.sum() / df_2.shape[0], df_2.r3.sum() / df_2.shape[0], df_2.r5.sum() / df_2.shape[0], df_2.r10.sum() / df_2.shape[0])\n",
" df_3 = df_eval\n",
" print('整体准确率', df_3.r1.sum() / df_3.shape[0], df_3.r3.sum() / df_3.shape[0], df_3.r5.sum() / df_3.shape[0], df_3.r10.sum() / df_3.shape[0])\n",
"\n",
" ##扣除不包含的标准问\n",
" print('目标语料准确率[有效标问]:——>')\n",
" df_k_need = df_eval[df_eval.ifin == 1]\n",
" print(df_k_need.shape)\n",
" df_1 = df_k_need\n",
" print('第一阶段整体准确率', df_1.t1.sum() / df_1.shape[0], df_1.t3.sum() / df_1.shape[0], df_1.t5.sum() / df_1.shape[0], df_1.t10.sum() / df_1.shape[0])\n",
" df_2 = df_k_need[df_k_need.t10 == 1]\n",
" print('第二阶段整体准确率', df_2.r1.sum() / df_2.shape[0], df_2.r3.sum() / df_2.shape[0], df_2.r5.sum() / df_2.shape[0], df_2.r10.sum() / df_2.shape[0])\n",
" df_3 = df_k_need\n",
" print('整体准确率', df_3.r1.sum() / df_3.shape[0], df_3.r3.sum() / df_3.shape[0], df_3.r5.sum() / df_3.shape[0], df_3.r10.sum() / df_3.shape[0])\n",
" df_eval.to_csv(output_path, index=None, sep=\"\\t\")\n"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.8 64-bit ('base': conda)"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
},
"interpreter": {
"hash": "509cf8fb3e64af7327dbc287206db89f13b65f7dad389d82b165e29388b2e60b"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
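The r1/r3/r5/r10 columns above (and the t1/t3/t5/t10 columns produced by cal_performance in utils.py) are plain recall@K flags: 1 when the true standard question appears among the top-K ranked predictions, 0 otherwise. A tiny helper expressing the same check (the function name is mine, not from the notebook):

def recall_at_k(true_label, ranked_preds, k):
    """1 if the true standard question is among the top-k ranked predictions, else 0."""
    return 1 if true_label in ranked_preds[:k] else 0

# df_eval['r3'] above is equivalent to:
# df_eval.apply(lambda row: recall_at_k(row['q_std'], row['q_std_pred_list_2_v1'], 3), axis=1)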
bert/bert4torch_cmcc/examples/sentence_embedding/FinanceFAQ/utils.py
0 → 100644
View file @
0e29b9b7
import torch
from torch import Tensor
import numpy as np
import pandas as pd


def pytorch_cos_sim(a: Tensor, b: Tensor):
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)
    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)
    if len(a.shape) == 1:
        a = a.unsqueeze(0)
    if len(b.shape) == 1:
        b = b.unsqueeze(0)
    a_norm = a / a.norm(dim=1)[:, None]
    b_norm = b / b.norm(dim=1)[:, None]
    return torch.mm(a_norm, b_norm.transpose(0, 1))


def cos_sim(vector_a, vector_b):
    """Cosine similarity between two vectors, rescaled to [0, 1]
    :param vector_a: vector a
    :param vector_b: vector b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim


def cos_sim_1(vector_a, vector_b):
    """Raw cosine similarity between two vectors
    :param vector_a: vector a
    :param vector_b: vector b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    cos = num / denom
    return cos


def cos_sim4matrix(arr, brr):
    return 0.5 + 0.5 * (arr.dot(brr.T) / (np.sqrt(np.sum(arr * arr)) * np.sqrt(np.sum(brr * brr, axis=1))))


def cos_sim4matrix_2(arr, brr):
    return (arr.dot(brr.T) / (np.sqrt(np.sum(arr * arr)) * np.sqrt(np.sum(brr * brr, axis=1))))


def read_q_std_q_corpus(q_std_file, q_std_vectors_file, q_corpus_file, q_corpus_vectors_file):
    '''Load the standard questions (q_std), the full corpus (q_corpus) and their precomputed vectors
    '''
    print('读取标准问及其向量'.center(60, '-'))
    q_std_list = pd.read_csv(q_std_file, sep="\t", names=['c']).c.tolist()
    q_std_sentence_embeddings = np.load(q_std_vectors_file)
    print('标准问shape:', q_std_sentence_embeddings.shape, len(q_std_list))

    print('读取所有语料及其向量'.center(60, '-'))
    q_all = pd.read_csv(q_corpus_file, sep="\t", names=['c']).c.tolist()
    q_all_sentence_embeddings = np.load(q_corpus_vectors_file)
    q_all_sentence_embeddings_dict = {q_all[i]: q_all_sentence_embeddings[i] for i in range(0, len(q_all))}
    print('所有语料shape', q_all_sentence_embeddings.shape, len(q_all))
    return q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict


def cal_performance(model, q_all_sentence_embeddings_dict, q_std_sentence_embeddings, q_std_list, df_eval, K=20):
    '''Compute the top-K recall metrics
    '''
    texts = df_eval.q_sim.tolist()
    texts_in = [v for v in texts if v in q_all_sentence_embeddings_dict.keys()]
    texts_out = [v for v in texts if v not in q_all_sentence_embeddings_dict.keys()]
    texts_out_embeddings = model.encode(texts_out) if texts_out else []
    texts_embeddings_dict_1 = {texts_in[i]: q_all_sentence_embeddings_dict[texts_in[i]] for i in range(0, len(texts_in))}
    texts_embeddings_dict_2 = {texts_out[i]: texts_out_embeddings[i] for i in range(0, len(texts_out))}
    texts_embeddings_dict = {**texts_embeddings_dict_1, **texts_embeddings_dict_2}

    print(f'计算相似度 K={K}'.center(60, '-'))
    df_eval['ifin'] = df_eval.q_std.apply(lambda v: 1 if v in q_std_list else 0)
    print("目标语料标问是否存在:——>", df_eval.groupby("ifin")["ifin"].count())

    print('----计算所有query和q_std的相似度')
    x_texts_embeddings = np.array([texts_embeddings_dict[x_text] for x_text in texts])
    cos_scores = pytorch_cos_sim(x_texts_embeddings, q_std_sentence_embeddings).cpu()
    print('shape: ', x_texts_embeddings.shape, q_std_sentence_embeddings.shape, cos_scores.shape)

    print(f'----为每条相似问找到相似度最大的{K}条标问'.center(60, '-'))
    cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, K, dim=1, largest=True, sorted=False)
    cos_scores_top_k_values = cos_scores_top_k_values.tolist()
    cos_scores_top_k_idx = cos_scores_top_k_idx.tolist()
    cos_q_corpus_sort = [[q_std_list[v] for v in vlist] for vlist in cos_scores_top_k_idx]
    # the top-K most similar standard questions with their scores
    result = [list(zip(cos_q_corpus_sort[i], cos_scores_top_k_values[i])) for i in range(0, len(texts))]
    texts_topk_dict = {texts[i]: result[i] for i in range(0, len(texts))}

    # for each similar question, keep its prediction: the top-K predicted standard questions and their similarities
    df_eval['q_std_pred_list'] = df_eval.q_sim.map(texts_topk_dict)
    # similarity between q_sim and its true q_std
    df_eval['prob_with_std'] = df_eval.apply(lambda row: cos_sim_1(texts_embeddings_dict[row['q_sim']], q_std_sentence_embeddings[q_std_list.index(row['q_std'])]), axis=1)
    df_eval.loc[:, 'q_std_pred'] = df_eval.q_std_pred_list.apply(lambda v: v[0][0])
    df_eval.loc[:, 'prob'] = df_eval.q_std_pred_list.apply(lambda v: v[0][1])
    # df_eval.loc[:,'q_std_pred_list_pair']=df_eval.apply(lambda row: [(row['q_std'],row['q_sim'],v[0],v[1]) for v in row['q_std_pred_list']],axis=1)
    df_eval['q_std_pred_list_v1'] = df_eval.q_std_pred_list.apply(lambda v: [k[0] for k in v])  # keep only the predicted standard questions
    df_eval['q_std_pred_list_v2'] = df_eval.q_std_pred_list.apply(lambda v: [k[1] for k in v])  # keep only the predicted probabilities

    df_eval['t1'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:1] else 0, axis=1)
    df_eval['t3'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:3] else 0, axis=1)
    df_eval['t5'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:5] else 0, axis=1)
    df_eval['t10'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:10] else 0, axis=1)
    print('----模型准确率: ', df_eval.t1.sum() / df_eval.shape[0], df_eval.t3.sum() / df_eval.shape[0], df_eval.t5.sum() / df_eval.shape[0], df_eval.t10.sum() / df_eval.shape[0])
    df_eval_need = df_eval[df_eval.ifin == 1]
    print('----模型准确率:[有效标问]:', df_eval_need.t1.sum() / df_eval_need.shape[0], df_eval_need.t3.sum() / df_eval_need.shape[0], df_eval_need.t5.sum() / df_eval_need.shape[0], df_eval_need.t10.sum() / df_eval_need.shape[0])
    return df_eval
\ No newline at end of file
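A note on cos_sim4matrix_2 above: the denominator term np.sum(arr * arr) sums over all elements of arr, so the function only yields one cosine per row of brr when arr is a single 1-D query vector (or a single-row matrix), which is how the FinanceFAQ notebooks call it. A minimal sanity check, assuming the file above is importable as utils:

import numpy as np
from utils import cos_sim4matrix_2

query = np.array([1.0, 0.0])                 # one query embedding, shape (d,)
cands = np.array([[1.0, 0.0], [0.0, 1.0]])   # candidate embeddings, shape (n, d)
print(cos_sim4matrix_2(query, cands))        # -> [1. 0.], one cosine per candidate row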
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_DimensionalityReduction.py
0 → 100644
View file @
0e29b9b7
#! -*- coding:utf-8 -*-
# Compress the sentence vectors with PCA
# Going from 768 dims down to 128 dims, the metric drops from 81.82 to 80.10

from task_sentence_embedding_sup_CosineMSELoss import model, train_dataloader, Model, device, valid_dataloader, evaluate
from bert4torch.snippets import get_pool_emb
from sklearn.decomposition import PCA
import numpy as np
import torch
import torch.nn as nn

new_dimension = 128  # target dimension

train_embeddings = []
for token_ids_list, labels in train_dataloader:
    for token_ids in token_ids_list:
        train_embeddings.append(model.encode(token_ids))
    # if len(train_embeddings) >= 20:
    #     break
train_embeddings = torch.cat(train_embeddings, dim=0).cpu().numpy()
print('train_embeddings done, start pca training...')

pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)
print('PCA training done...')

# define the model structure on top of bert
class NewModel(Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.dense = nn.Linear(768, new_dimension, bias=False)
        self.dense.weight = torch.nn.Parameter(torch.tensor(pca_comp, device=device))

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pool_cls = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pool_cls, attention_mask, self.pool_method)
            output = self.dense(output)
        return output

new_model = NewModel().to(device)
new_model.load_weights('best_model.pt', strict=False)

print('Start evaluating...')
val_consine = evaluate(new_model, valid_dataloader)
print(f'val_consine: {val_consine:.5f}\n')
\ No newline at end of file
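One detail worth noting in the script above: sklearn's PCA.transform subtracts the fitted mean before projecting, while the bias-free nn.Linear(768, new_dimension) layer only multiplies by components_, so the reduced embeddings differ from pca.transform by the constant offset pca.mean_ @ pca.components_.T. A small numpy sketch of that relationship (illustrative only, not part of the original file):

import numpy as np
from sklearn.decomposition import PCA

x = np.random.RandomState(0).randn(200, 768)
pca = PCA(n_components=128).fit(x)

projected = x @ pca.components_.T        # what the bias-free Linear layer computes
offset = pca.mean_ @ pca.components_.T   # constant shift that pca.transform also removes
assert np.allclose(projected - offset, pca.transform(x), atol=1e-3)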
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_model_distillation.py
0 → 100644
View file @
0e29b9b7
#! -*- coding:utf-8 -*-
# Model compression: keep only part of the bert-base layers
# A first rough test shows the metric drops from ~80% to ~77%; not measured carefully

from task_sentence_embedding_sup_CosineMSELoss import model, train_dataloader, Model, device, valid_dataloader, evaluate
from bert4torch.snippets import Callback, get_pool_emb
import torch.optim as optim
import torch.nn as nn
from bert4torch.models import build_transformer_model

train_token_ids, train_embeddings = [], []
for token_ids_list, labels in train_dataloader:
    train_token_ids.extend(token_ids_list)
    for token_ids in token_ids_list:
        train_embeddings.append(model.encode(token_ids))
    # if len(train_embeddings) >= 20:
    #     break
new_train_dataloader = list(zip(train_token_ids, train_embeddings))
print('train_embeddings done, start model distillation...')

# keep only a fixed subset of the hidden layers
class NewModel(Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
        self.bert = build_transformer_model(config_path=config_path, with_pool=True, segment_vocab_size=0, keep_hidden_layers=[1, 4, 7])

    def forward(self, token_ids):
        hidden_state, pooler = self.bert([token_ids])
        attention_mask = token_ids.gt(0).long()
        output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

new_model = NewModel().to(device)
new_model.compile(
    loss=nn.MSELoss(),
    optimizer=optim.Adam(new_model.parameters(), lr=2e-5),
)
new_model.load_weights('best_model.pt', strict=False)  # load part of the layers of the large model
val_consine = evaluate(new_model, valid_dataloader)
print('init val_cosine after distillation: ', val_consine)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(new_model, valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # new_model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    new_model.fit(new_train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    new_model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_CoSENT.py
0 → 100644
View file @
0e29b9b7
#! -*- coding:utf-8 -*-
# Original project: https://kexue.fm/archives/8847

from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # passed in on the command line
pooling, task_name = 'cls', 'ATEC'  # hard-coded for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data; one sample per line: (text1, text2, label_id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], int(l[2])))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text1, text2, label in batch:
        for text in [text1, text2]:
            token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return batch_token_ids, batch_labels.flatten()

# load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# define the model structure on top of bert
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)

    def forward(self, token_ids):
        hidden_state, pooler = self.bert([token_ids])
        sem_emb = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return sem_emb

model = Model().to(device)

class MyLoss(nn.Module):
    def forward(self, y_pred, y_true):
        # 1. keep one true label per sentence pair, e.g. tensor([1, 0, 1])
        y_true = y_true[::2]
        # 2. l2-normalize the sentence vectors so that a plain dot product gives the cosine
        norms = (y_pred ** 2).sum(axis=1, keepdims=True) ** 0.5
        # y_pred = y_pred / torch.clip(norms, 1e-8, torch.inf)
        y_pred = y_pred / norms
        # 3. multiply the even/odd rows elementwise to get the pair cosines, scaled by 20
        y_pred = torch.sum(y_pred[::2] * y_pred[1::2], dim=1) * 20
        # 4. pairwise differences: entry (i, j) is cosine_i - cosine_j
        y_pred = y_pred[:, None] - y_pred[None, :]
        # keep only the (negative pair) - (positive pair) differences
        y_true = y_true[:, None] < y_true[None, :]
        y_true = y_true.float()
        y_pred = y_pred - (1 - y_true) * 1e12
        y_pred = y_pred.view(-1)
        # prepend a 0 because e^0 = 1, which is the +1 inside the log
        y_pred = torch.cat((torch.tensor([0.0], device=device), y_pred), dim=0)
        return torch.logsumexp(y_pred, dim=0)

# loss and optimizer; custom ones are supported
model.compile(
    loss=MyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = self.evaluate(valid_dataloader)
        test_consine = self.evaluate(test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

    # evaluation function
    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for batch_token_ids, batch_labels in tqdm(data, desc='Evaluate'):
            embeddings = model.predict(batch_token_ids)
            embeddings1.append(embeddings[::2])
            embeddings2.append(embeddings[1::2])
            labels.append(batch_labels[::2])
        embeddings1 = torch.cat(embeddings1).cpu().numpy()
        embeddings2 = torch.cat(embeddings2).cpu().numpy()
        labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
        return eval_pearson_cosine

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
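MyLoss above assumes the batch convention built by collate_fn: the rows of y_pred are interleaved sentence pairs (a1, b1, a2, b2, ...) and y_true repeats each pair's label twice; the loss then penalises any negative pair whose cosine is not clearly below every positive pair's cosine. A minimal sanity check of that convention (illustrative; it assumes it is run in the same script so that MyLoss and device are in scope):

loss_fn = MyLoss()
y_pred = torch.tensor([[1.0, 0.0],   # pair 1, sentence a
                       [1.0, 0.0],   # pair 1, sentence b -> cosine 1 (positive pair)
                       [1.0, 0.0],   # pair 2, sentence a
                       [0.0, 1.0]],  # pair 2, sentence b -> cosine 0 (negative pair)
                      device=device)
y_true = torch.tensor([1.0, 1.0, 0.0, 0.0], device=device)
print(loss_fn(y_pred, y_true))  # close to 0: the positive pair already scores well above the negative one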
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_ContrastiveLoss.py
0 → 100644
View file @
0e29b9b7
#! -*- coding:utf-8 -*-
# loss: ContrastiveLoss

from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
from bert4torch.losses import ContrastiveLoss
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # passed in on the command line
pooling, task_name = 'cls', 'ATEC'  # hard-coded for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data; one sample per line: (text1, text2, label_id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) == 3:
                    D.append((l[0], l[1], int(l[2])))
        return D

def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([int(label > 2.5) if task_name == 'STS-B' else label])

    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# define the model structure on top of bert
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)

    def forward(self, token1_ids, token2_ids):
        hidden_state1, pool_cls1 = self.bert([token1_ids])
        pool_emb1 = get_pool_emb(hidden_state1, pool_cls1, token1_ids.gt(0).long(), self.pool_method)

        hidden_state2, pool_cls2 = self.bert([token2_ids])
        pool_emb2 = get_pool_emb(hidden_state2, pool_cls2, token2_ids.gt(0).long(), self.pool_method)

        distance = 1 - torch.cosine_similarity(pool_emb1, pool_emb2)
        return distance

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

model = Model().to(device)

# loss and optimizer; custom ones are supported
model.compile(
    loss=ContrastiveLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = self.evaluate(valid_dataloader)
        test_consine = self.evaluate(test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

    # evaluation function
    def evaluate(self, data):
        embeddings1, embeddings2, labels = [], [], []
        for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
            embeddings1.append(model.predict(batch_token1_ids).cpu())
            embeddings2.append(model.predict(batch_token2_ids).cpu())
            labels.append(batch_labels)
        embeddings1 = torch.cat(embeddings1).numpy()
        embeddings2 = torch.cat(embeddings2).numpy()
        labels = torch.cat(labels).cpu().numpy()
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
        return eval_pearson_cosine

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
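For reference, the setup above feeds the loss a distance d = 1 - cosine per pair together with a 0/1 similarity label, which is the input expected by the classic contrastive loss label*d^2 + (1-label)*relu(margin-d)^2. How exactly bert4torch's ContrastiveLoss averages or scales this is not shown in this file, so the sketch below is only the textbook form (Hadsell et al., 2006), not the library implementation:

import torch
import torch.nn as nn

class SimpleContrastiveLoss(nn.Module):
    """Textbook contrastive loss over precomputed distances."""
    def __init__(self, margin=0.5):
        super().__init__()
        self.margin = margin

    def forward(self, distances, labels):
        pos = labels * distances.pow(2)                                  # pull similar pairs together
        neg = (1 - labels) * torch.relu(self.margin - distances).pow(2)  # push dissimilar pairs beyond the margin
        return 0.5 * (pos + neg).mean()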
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_CosineMSELoss.py
0 → 100644
View file @
0e29b9b7
#! -*- coding:utf-8 -*-
# loss: CosineMSELoss(cos + mse_loss)

from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset, get_pool_emb, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import paired_cosine_distances
from scipy.stats import spearmanr
from tqdm import tqdm
import sys

# ============================= Basic parameters =============================
# pooling, task_name = sys.argv[1:]  # passed in on the command line
pooling, task_name = 'cls', 'ATEC'  # hard-coded for debugging
print('pooling: ', pooling, ' task_name: ', task_name)
assert task_name in ['ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STS-B']
assert pooling in {'first-last-avg', 'last-avg', 'cls', 'pooler'}

maxlen = 64 if task_name != 'PAWSX' else 128
batch_size = 32
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data; one sample per line: (text1, text2, label_id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = l.strip().split('\t')
                if len(l) != 3:
                    continue
                text1, text2, label = l
                label = int(label) / 5 if task_name == 'STS-B' else int(label)
                D.append((text1, text2, label))
        return D

def collate_fn(batch):
    batch_token1_ids, batch_token2_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token1_ids, _ = tokenizer.encode(text1, maxlen=maxlen)
        batch_token1_ids.append(token1_ids)
        token2_ids, _ = tokenizer.encode(text2, maxlen=maxlen)
        batch_token2_ids.append(token2_ids)
        batch_labels.append([label])

    batch_token1_ids = torch.tensor(sequence_padding(batch_token1_ids), dtype=torch.long, device=device)
    batch_token2_ids = torch.tensor(sequence_padding(batch_token2_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.float, device=device)
    return (batch_token1_ids, batch_token2_ids), batch_labels.flatten()

# load the datasets
train_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'F:/Projects/data/corpus/sentence_embedding/{task_name}/{task_name}.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# define the model structure on top of bert
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        with_pool = 'linear' if pool_method == 'pooler' else True
        output_all_encoded_layers = True if pool_method == 'first-last-avg' else False
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0, with_pool=with_pool, output_all_encoded_layers=output_all_encoded_layers)

    def forward(self, token1_ids, token2_ids):
        hidden_state1, pooler1 = self.bert([token1_ids])
        pool_emb1 = get_pool_emb(hidden_state1, pooler1, token1_ids.gt(0).long(), self.pool_method)

        hidden_state2, pooler2 = self.bert([token2_ids])
        pool_emb2 = get_pool_emb(hidden_state2, pooler2, token2_ids.gt(0).long(), self.pool_method)

        return torch.cosine_similarity(pool_emb1, pool_emb2)

    def encode(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_state, pooler = self.bert([token_ids])
            attention_mask = token_ids.gt(0).long()
            output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

model = Model().to(device)

# loss and optimizer; custom ones are supported
model.compile(
    loss=nn.MSELoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5)
)

# evaluation function
def evaluate(model, data):
    embeddings1, embeddings2, labels = [], [], []
    for (batch_token1_ids, batch_token2_ids), batch_labels in tqdm(data, desc='Evaluate'):
        embeddings1.append(model.encode(batch_token1_ids))
        embeddings2.append(model.encode(batch_token2_ids))
        labels.append(batch_labels)
    embeddings1 = torch.cat(embeddings1).cpu().numpy()
    embeddings2 = torch.cat(embeddings2).cpu().numpy()
    labels = torch.cat(labels).cpu().numpy()
    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    eval_pearson_cosine, _ = spearmanr(labels, cosine_scores)
    return eval_pearson_cosine

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_consine = evaluate(model, valid_dataloader)
        test_consine = evaluate(model, test_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # model.save_weights('best_model.pt')
        print(f'valid_consine: {val_consine:.5f}, test_consine: {test_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=5, steps_per_epoch=None, callbacks=[evaluator])
bert/bert4torch_cmcc/examples/sentence_embedding/task_sentence_embedding_sup_InfoNCE.py
0 → 100644
View file @
0e29b9b7
#! -*- coding:utf-8 -*-
# loss: InfoNCE(即sentence_transformer中的MultiNegativeRankingLoss)
# 样本都是正负样本对,因此构造(正,正,负)的三元组时候,正样本对(正,正1)随机抽样负样本为(正,正1,负)
# 负样本对(正,负)重复正样本对(正,正,负)
from
bert4torch.tokenizers
import
Tokenizer
from
bert4torch.models
import
build_transformer_model
,
BaseModel
from
bert4torch.snippets
import
sequence_padding
,
Callback
,
ListDataset
,
get_pool_emb
,
seed_everything
import
torch.nn
as
nn
import
torch
import
torch.optim
as
optim
from
torch.utils.data
import
DataLoader
from
sklearn.metrics.pairwise
import
paired_cosine_distances
from
scipy.stats
import
spearmanr
import
random
from
tqdm
import
tqdm
import
sys
# =============================基本参数=============================
# pooling, task_name = sys.argv[1:] # 传入参数
pooling
,
task_name
=
'cls'
,
'ATEC'
# debug使用
print
(
'pooling: '
,
pooling
,
' task_name: '
,
task_name
)
assert
task_name
in
[
'ATEC'
,
'BQ'
,
'LCQMC'
,
'PAWSX'
,
'STS-B'
]
assert
pooling
in
{
'first-last-avg'
,
'last-avg'
,
'cls'
,
'pooler'
}
maxlen
=
64
if
task_name
!=
'PAWSX'
else
128
batch_size
=
32
config_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device
=
'cuda'
if
torch
.
cuda
.
is_available
()
else
'cpu'
seed_everything
(
42
)
# 建立分词器
tokenizer
=
Tokenizer
(
dict_path
,
do_lower_case
=
True
)
# ===========================数据预处理===========================
# 训练
def
collate_fn
(
batch
):
texts_list
=
[[]
for
_
in
range
(
3
)]
for
texts
in
batch
:
for
i
,
text
in
enumerate
(
texts
):
token_ids
,
_
=
tokenizer
.
encode
(
text
,
maxlen
=
maxlen
)
texts_list
[
i
].
append
(
token_ids
)
for
i
,
texts
in
enumerate
(
texts_list
):
texts_list
[
i
]
=
torch
.
tensor
(
sequence_padding
(
texts
),
dtype
=
torch
.
long
,
device
=
device
)
labels
=
torch
.
arange
(
texts_list
[
0
].
size
(
0
),
device
=
texts_list
[
0
].
device
)
return
texts_list
,
labels
# 加载数据集
def
get_data
(
filename
):
train_data
,
all_texts
=
{},
[]
with
open
(
filename
,
encoding
=
'utf-8'
)
as
f
:
for
l
in
f
:
l
=
l
.
strip
().
split
(
'
\t
'
)
if
len
(
l
)
!=
3
:
continue
text1
,
text2
,
label
=
l
label
=
str
(
int
(
int
(
label
)
>
2.5
))
if
task_name
==
'STS-B'
else
label
if
text1
not
in
train_data
:
train_data
[
text1
]
=
{
'0'
:
set
(),
'1'
:
set
()}
train_data
[
text1
][
label
].
add
(
text2
)
if
text2
not
in
train_data
:
train_data
[
text2
]
=
{
'0'
:
set
(),
'1'
:
set
()}
train_data
[
text2
][
label
].
add
(
text1
)
all_texts
.
extend
([
text1
,
text2
])
train_samples
=
[]
for
sent1
,
others
in
train_data
.
items
():
if
len
(
others
[
'1'
])
==
0
:
others
[
'1'
]
=
[
sent1
]
# 没有正样本,使用自身作为正阳本,这里其实就是无监督
elif
len
(
others
[
'0'
])
==
0
:
others
[
'0'
]
=
[
random
.
choice
(
all_texts
)]
# 没有负样本,随机挑选一个负样本
# sentence bert的逻辑是下面两个都加进去,这样的问题是如果shuffle=False,处于同一个batch中,相似句可能label给的负样本
if
random
.
random
()
<
0.5
:
train_samples
.
append
((
sent1
,
random
.
choice
(
list
(
others
[
'1'
])),
random
.
choice
(
list
(
others
[
'0'
]))))
else
:
train_samples
.
append
((
random
.
choice
(
list
(
others
[
'1'
])),
sent1
,
random
.
choice
(
list
(
others
[
'0'
]))))
return
train_samples
train_data
=
get_data
(
f
'F:/Projects/data/corpus/sentence_embedding/
{
task_name
}
/
{
task_name
}
.train.data'
)
train_dataloader
=
DataLoader
(
ListDataset
(
data
=
train_data
),
batch_size
=
batch_size
,
shuffle
=
True
,
collate_fn
=
collate_fn
)
class
MyDataset
(
ListDataset
):
@
staticmethod
def
load_data
(
filename
):
"""加载数据
单条格式:(文本1, 文本2, 标签id)
"""
D
=
[]
with
open
(
filename
,
encoding
=
'utf-8'
)
as
f
:
for
l
in
f
:
l
=
l
.
strip
().
split
(
'
\t
'
)
if
len
(
l
)
==
3
:
D
.
append
((
l
[
0
],
l
[
1
],
int
(
l
[
2
])))
return
D
def
collate_fn_eval
(
batch
):
batch_token1_ids
,
batch_token2_ids
,
batch_labels
=
[],
[],
[]
for
text1
,
text2
,
label
in
batch
:
token1_ids
,
_
=
tokenizer
.
encode
(
text1
,
maxlen
=
maxlen
)
batch_token1_ids
.
append
(
token1_ids
)
token2_ids
,
_
=
tokenizer
.
encode
(
text2
,
maxlen
=
maxlen
)
batch_token2_ids
.
append
(
token2_ids
)
batch_labels
.
append
([
label
])
batch_token1_ids
=
torch
.
tensor
(
sequence_padding
(
batch_token1_ids
),
dtype
=
torch
.
long
,
device
=
device
)
batch_token2_ids
=
torch
.
tensor
(
sequence_padding
(
batch_token2_ids
),
dtype
=
torch
.
long
,
device
=
device
)
batch_labels
=
torch
.
tensor
(
batch_labels
,
dtype
=
torch
.
long
,
device
=
device
)
return
(
batch_token1_ids
,
batch_token2_ids
),
batch_labels
.
flatten
()
# 加载数据集
valid_dataloader
=
DataLoader
(
MyDataset
(
f
'F:/Projects/data/corpus/sentence_embedding/
{
task_name
}
/
{
task_name
}
.valid.data'
),
batch_size
=
batch_size
,
collate_fn
=
collate_fn_eval
)
test_dataloader
=
DataLoader
(
MyDataset
(
f
'F:/Projects/data/corpus/sentence_embedding/
{
task_name
}
/
{
task_name
}
.test.data'
),
batch_size
=
batch_size
,
collate_fn
=
collate_fn_eval
)
# 建立模型
class
Model
(
BaseModel
):
def
__init__
(
self
,
pool_method
=
'cls'
,
scale
=
20.0
):
super
().
__init__
()
self
.
pool_method
=
pool_method
with_pool
=
'linear'
if
pool_method
==
'pooler'
else
True
output_all_encoded_layers
=
True
if
pool_method
==
'first-last-avg'
else
False
self
.
bert
=
build_transformer_model
(
config_path
,
checkpoint_path
,
segment_vocab_size
=
0
,
with_pool
=
with_pool
,
output_all_encoded_layers
=
output_all_encoded_layers
)
self
.
scale
=
scale
def
forward
(
self
,
token_ids_list
):
reps
=
[]
for
token_ids
in
token_ids_list
:
hidden_state1
,
pooler
=
self
.
bert
([
token_ids
])
rep
=
get_pool_emb
(
hidden_state1
,
pooler
,
token_ids
.
gt
(
0
).
long
(),
self
.
pool_method
)
reps
.
append
(
rep
)
embeddings_a
=
reps
[
0
]
embeddings_b
=
torch
.
cat
(
reps
[
1
:])
scores
=
self
.
cos_sim
(
embeddings_a
,
embeddings_b
)
*
self
.
scale
# [btz, btz]
return
scores
def
predict
(
self
,
token_ids
):
self
.
eval
()
with
torch
.
no_grad
():
hidden_state
,
pooler
=
self
.
bert
([
token_ids
])
output
=
get_pool_emb
(
hidden_state
,
pooler
,
token_ids
.
gt
(
0
).
long
(),
self
.
pool_method
)
return
output
@
staticmethod
def
cos_sim
(
a
,
b
):
a_norm
=
torch
.
nn
.
functional
.
normalize
(
a
,
p
=
2
,
dim
=
1
)
b_norm
=
torch
.
nn
.
functional
.
normalize
(
b
,
p
=
2
,
dim
=
1
)
return
torch
.
mm
(
a_norm
,
b_norm
.
transpose
(
0
,
1
))
model
=
Model
().
to
(
device
)
# 定义使用的loss和optimizer,这里支持自定义
model
.
compile
(
loss
=
nn
.
CrossEntropyLoss
(),
optimizer
=
optim
.
Adam
(
model
.
parameters
(),
lr
=
2e-5
),
)
class
Evaluator
(
Callback
):
"""评估与保存
"""
def
__init__
(
self
):
self
.
best_val_consine
=
0.
def
on_epoch_end
(
self
,
global_step
,
epoch
,
logs
=
None
):
val_consine
=
self
.
evaluate
(
valid_dataloader
)
test_consine
=
self
.
evaluate
(
test_dataloader
)
if
val_consine
>
self
.
best_val_consine
:
self
.
best_val_consine
=
val_consine
# model.save_weights('best_model.pt')
print
(
f
'valid_consine:
{
val_consine
:.
5
f
}
, test_consine:
{
test_consine
:.
5
f
}
, best_val_consine:
{
self
.
best_val_consine
:.
5
f
}
\n
'
)
# 重新生成dataloader,重新random选择样本
train_data
=
get_data
(
f
'F:/Projects/data/corpus/sentence_embedding/
{
task_name
}
/
{
task_name
}
.train.data'
)
model
.
train_dataloader
=
DataLoader
(
ListDataset
(
data
=
train_data
),
batch_size
=
batch_size
,
shuffle
=
True
,
collate_fn
=
collate_fn
)
# 定义评价函数
def
evaluate
(
self
,
data
):
embeddings1
,
embeddings2
,
labels
=
[],
[],
[]
for
(
batch_token_ids1
,
batch_token_ids2
),
batch_labels
in
tqdm
(
data
,
desc
=
'Evaluate'
):
embeddings1
.
append
(
model
.
predict
(
batch_token_ids1
))
embeddings2
.
append
(
model
.
predict
(
batch_token_ids2
))
labels
.
append
(
batch_labels
)
embeddings1
=
torch
.
cat
(
embeddings1
).
cpu
().
numpy
()
embeddings2
=
torch
.
cat
(
embeddings2
).
cpu
().
numpy
()
labels
=
torch
.
cat
(
labels
).
cpu
().
numpy
()
cosine_scores
=
1
-
(
paired_cosine_distances
(
embeddings1
,
embeddings2
))
# cosine距离是1-paired
eval_pearson_cosine
,
_
=
spearmanr
(
labels
,
cosine_scores
)
return
eval_pearson_cosine
if
__name__
==
'__main__'
:
evaluator
=
Evaluator
()
model
.
fit
(
train_dataloader
,
epochs
=
10
,
steps_per_epoch
=
None
,
callbacks
=
[
evaluator
]
)
else
:
model
.
load_weights
(
'best_model.pt'
)
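Why nn.CrossEntropyLoss plus labels = torch.arange(batch) implements InfoNCE here: forward returns a [batch, 2*batch] score matrix in which columns 0..batch-1 are the in-batch positives and columns batch..2*batch-1 are the explicitly sampled negatives, so for anchor i the correct "class" is column i, i.e. its own positive, while every other column acts as a negative. A toy illustration with stand-in scores (shapes only, not from the file):

import torch
import torch.nn as nn

batch = 4
scores = torch.randn(batch, 2 * batch)  # stand-in for model([anchors, positives, negatives])
labels = torch.arange(batch)            # anchor i must rank column i (its own positive) highest
print(nn.CrossEntropyLoss()(scores, labels))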