sunzhq2 / yidong-infer / Commits / 92c75df1

Commit 92c75df1, authored Jan 20, 2026 by sunzhq2

yidong infer init

Changes: 150
Showing 20 changed files with 3167 additions and 0 deletions (+3167, -0).
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_CasRel.py (+318, -0)
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_gplinker.py (+248, -0)
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_tplinker.py (+282, -0)
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_tplinker_plus.py (+257, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/README.md (+33, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/baseline/data_processing.ipynb (+83, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/baseline/task_sentiment_sohu.py (+289, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/top1/training.py (+189, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/top1/training_bert.py (+192, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/README.md (+13, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/convert.py (+57, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/inference.py (+45, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/training.py (+196, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentence_similarity_lcqmc.py (+113, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification.py (+135, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_GAU_alpha.py (+124, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_PET.py (+171, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_P_tuning.py (+181, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_albert.py (+124, -0)
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_electra.py (+117, -0)
Too many changes to show: to preserve performance, only 150 of 150+ files are displayed.
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_CasRel.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task based on the "half pointer, half tagging" structure (CasRel)
# Article: https://kexue.fm/archives/7161
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
import numpy as np
from bert4torch.layers import LayerNorm
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn as nn

maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the predicate label dict
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data.
        Single sample format: {'text': text, 'spo_list': [(s, p, o)]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                D.append({'text': l['text'],
                          'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
        return D


def collate_fn(batch):
    def search(pattern, sequence):
        """Find the sub-list pattern in sequence.
        Return the first index if found, otherwise -1.
        """
        n = len(pattern)
        for i in range(len(sequence)):
            if sequence[i:i + n] == pattern:
                return i
        return -1

    batch_token_ids, batch_segment_ids = [], []
    batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
    for d in batch:
        token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
        # Collect the triples as {s: [(o, p)]}
        spoes = {}
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            s_idx = search(s, token_ids)
            o_idx = search(o, token_ids)
            if s_idx != -1 and o_idx != -1:
                s = (s_idx, s_idx + len(s) - 1)
                o = (o_idx, o_idx + len(o) - 1, p)
                if s not in spoes:
                    spoes[s] = []
                spoes[s].append(o)
        if spoes:
            # subject labels
            subject_labels = np.zeros((len(token_ids), 2))
            for s in spoes:
                subject_labels[s[0], 0] = 1  # subject head
                subject_labels[s[1], 1] = 1  # subject tail
            # Randomly pick one subject (this is not a bug -- it is the intended behaviour!)
            # Todo: the unselected subjects could be masked out of the loss; the effect is probably
            # small because the model up-weights positives via prob**n
            start, end = np.array(list(spoes.keys())).T
            start = np.random.choice(start)
            end = np.random.choice(end[end >= start])
            subject_ids = (start, end)
            # Corresponding object labels
            object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
            for o in spoes.get(subject_ids, []):
                object_labels[o[0], o[2], 0] = 1
                object_labels[o[1], o[2], 1] = 1
            # Build the batch
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_subject_labels.append(subject_labels)
            batch_subject_ids.append(subject_ids)
            batch_object_labels.append(object_labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_subject_labels = torch.tensor(sequence_padding(batch_subject_labels), dtype=torch.float, device=device)
    batch_subject_ids = torch.tensor(batch_subject_ids, dtype=torch.long, device=device)
    batch_object_labels = torch.tensor(sequence_padding(batch_object_labels), dtype=torch.float, device=device)
    batch_attention_mask = (batch_token_ids != tokenizer._token_pad_id)
    return [batch_token_ids, batch_segment_ids, batch_subject_ids], [batch_subject_labels, batch_object_labels, batch_attention_mask]


train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)


# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path)
        self.linear1 = nn.Linear(768, 2)
        self.condLayerNorm = LayerNorm(hidden_size=768, conditional_size=768 * 2)
        self.linear2 = nn.Linear(768, len(predicate2id) * 2)

    @staticmethod
    def extract_subject(inputs):
        """Gather the subject representation from output according to subject_ids."""
        output, subject_ids = inputs
        start = torch.gather(output, dim=1, index=subject_ids[:, :1].unsqueeze(2).expand(-1, -1, output.shape[-1]))
        end = torch.gather(output, dim=1, index=subject_ids[:, 1:].unsqueeze(2).expand(-1, -1, output.shape[-1]))
        subject = torch.cat([start, end], 2)
        return subject[:, 0]

    def forward(self, inputs):
        # Predict subjects
        seq_output = self.bert(inputs[:2])  # [btz, seq_len, hdsz]
        subject_preds = (torch.sigmoid(self.linear1(seq_output)))**2  # [btz, seq_len, 2]

        # Feed the subject in and predict objects;
        # Conditional Layer Normalization injects the subject into the object prediction
        subject_ids = inputs[2]
        # In theory the pre-LayerNorm output should be used, but only each block's top output can be
        # returned here, so this differs from the keras implementation
        subject = self.extract_subject([seq_output, subject_ids])
        output = self.condLayerNorm([seq_output, subject])
        output = (torch.sigmoid(self.linear2(output)))**4
        object_preds = output.reshape(*output.shape[:2], len(predicate2id), 2)
        return [subject_preds, object_preds]

    def predict_subject(self, inputs):
        self.eval()
        with torch.no_grad():
            seq_output = self.bert(inputs[:2])  # [btz, seq_len, hdsz]
            subject_preds = (torch.sigmoid(self.linear1(seq_output)))**2  # [btz, seq_len, 2]
        return [seq_output, subject_preds]

    def predict_object(self, inputs):
        self.eval()
        with torch.no_grad():
            seq_output, subject_ids = inputs
            subject = self.extract_subject([seq_output, subject_ids])
            output = self.condLayerNorm([seq_output, subject])
            output = (torch.sigmoid(self.linear2(output)))**4
            object_preds = output.reshape(*output.shape[:2], len(predicate2id), 2)
        return object_preds


train_model = Model().to(device)


class BCELoss(nn.BCELoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, inputs, targets):
        subject_preds, object_preds = inputs
        subject_labels, object_labels, mask = targets

        # subject loss
        subject_loss = super().forward(subject_preds, subject_labels)
        subject_loss = subject_loss.mean(dim=-1)
        subject_loss = (subject_loss * mask).sum() / mask.sum()

        # object loss
        object_loss = super().forward(object_preds, object_labels)
        object_loss = object_loss.mean(dim=-1).sum(dim=-1)
        object_loss = (object_loss * mask).sum() / mask.sum()
        return subject_loss + object_loss


train_model.compile(loss=BCELoss(reduction='none'), optimizer=optim.Adam(train_model.parameters(), 1e-5))


def extract_spoes(text):
    """Extract the triples contained in the input text."""
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)

    # Extract subjects
    seq_output, subject_preds = train_model.predict_subject([token_ids, segment_ids])
    subject_preds[:, [0, -1]] *= 0  # zero out the leading [CLS] and trailing [SEP]
    start = torch.where(subject_preds[0, :, 0] > 0.6)[0]
    end = torch.where(subject_preds[0, :, 1] > 0.5)[0]
    subjects = []
    for i in start:
        j = end[end >= i]
        if len(j) > 0:
            j = j[0]
            subjects.append((i.item(), j.item()))
    if subjects:
        spoes = []
        # token_ids = token_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
        # segment_ids = segment_ids.repeat([len(subjects)]+[1]*(len(token_ids.shape)-1))
        seq_output = seq_output.repeat([len(subjects)] + [1] * (len(seq_output.shape) - 1))
        subjects = torch.tensor(subjects, dtype=torch.long, device=device)
        # Feed the subjects in, extract objects and predicates
        object_preds = train_model.predict_object([seq_output, subjects])
        object_preds[:, [0, -1]] *= 0
        for subject, object_pred in zip(subjects, object_preds):
            start = torch.where(object_pred[:, :, 0] > 0.6)
            end = torch.where(object_pred[:, :, 1] > 0.5)
            for _start, predicate1 in zip(*start):
                for _end, predicate2 in zip(*end):
                    if _start <= _end and predicate1 == predicate2:
                        spoes.append(
                            ((mapping[subject[0]][0], mapping[subject[1]][-1]), predicate1.item(),
                             (mapping[_start][0], mapping[_end][-1]))
                        )
                        break
        return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1]) for s, p, o in spoes]
    else:
        return []


class SPO(tuple):
    """Class for storing triples.
    It behaves like a tuple, but overrides __hash__ and __eq__ so that
    judging whether two triples are equivalent is more tolerant.
    """
    def __init__(self, spo):
        self.spox = (
            tuple(tokenizer.tokenize(spo[0])),
            spo[1],
            tuple(tokenizer.tokenize(spo[2])),
        )

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox


def evaluate(data):
    """Evaluation: compute f1, precision and recall."""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        R = set([SPO(spo) for spo in extract_spoes(d['text'])])
        T = set([SPO(spo) for spo in d['spo_list']])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall))
        s = json.dumps({
            'text': d['text'],
            'spo_list': list(T),
            'spo_list_pred': list(R),
            'new': list(R - T),
            'lack': list(T - R),
        }, ensure_ascii=False, indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall


class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        # optimizer.apply_ema_weights()
        f1, precision, recall = evaluate(valid_dataset.data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            # train_model.save_weights('best_model.pt')
        # optimizer.reset_old_weights()
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))


if __name__ == '__main__':
    evaluator = Evaluator()
    train_model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
    train_model.load_weights('best_model.pt')
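Note: the collate_fn above builds the "half pointer, half tagging" targets by locating each subject/object token span inside the encoded sentence. A minimal, self-contained sketch of that span search and label construction (the token ids and span below are made up for illustration, not taken from the BD dataset):

import numpy as np

def search(pattern, sequence):
    # return the first index where pattern occurs as a contiguous sub-list, else -1
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1

# toy encoded sentence and a toy subject span (hypothetical ids)
token_ids = [101, 12, 7, 34, 56, 7, 34, 102]
subject_ids = [7, 34]

s_idx = search(subject_ids, token_ids)          # -> 2 (first occurrence only)
subject_labels = np.zeros((len(token_ids), 2))  # column 0: span start, column 1: span end
if s_idx != -1:
    subject_labels[s_idx, 0] = 1
    subject_labels[s_idx + len(subject_ids) - 1, 1] = 1
print(s_idx, subject_labels.nonzero())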
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_gplinker.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task, a TPLinker-style design based on GlobalPointer
# Article: https://kexue.fm/archives/8888
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.layers import GlobalPointer
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.losses import SparseMultilabelCategoricalCrossentropy
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np

maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the predicate label dict
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data.
        Single sample format: {'text': text, 'spo_list': [(s, p, o)]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                D.append({'text': l['text'],
                          'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
        return D


def collate_fn(batch):
    def search(pattern, sequence):
        """Find the sub-list pattern in sequence.
        Return the first index if found, otherwise -1.
        """
        n = len(pattern)
        for i in range(len(sequence)):
            if sequence[i:i + n] == pattern:
                return i
        return -1

    batch_token_ids, batch_segment_ids = [], []
    batch_entity_labels, batch_head_labels, batch_tail_labels = [], [], []
    for d in batch:
        token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen)
        # Collect the triples as {s: [(o, p)]}
        spoes = set()
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            sh = search(s, token_ids)
            oh = search(o, token_ids)
            if sh != -1 and oh != -1:
                spoes.add((sh, sh + len(s) - 1, p, oh, oh + len(o) - 1))
        # Build the labels
        entity_labels = [set() for _ in range(2)]
        head_labels = [set() for _ in range(len(predicate2id))]
        tail_labels = [set() for _ in range(len(predicate2id))]
        for sh, st, p, oh, ot in spoes:
            entity_labels[0].add((sh, st))
            entity_labels[1].add((oh, ot))
            head_labels[p].add((sh, oh))
            tail_labels[p].add((st, ot))
        for label in entity_labels + head_labels + tail_labels:
            if not label:  # at least one label is required
                label.add((0, 0))  # pad with (0, 0) if empty
        entity_labels = sequence_padding([list(l) for l in entity_labels])  # [subject/object=2, num_entities, entity start/end]
        head_labels = sequence_padding([list(l) for l in head_labels])      # [num_relations, subject/object pairs of that relation, head positions]
        tail_labels = sequence_padding([list(l) for l in tail_labels])      # [num_relations, subject/object pairs of that relation, tail positions]
        # Build the batch
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_entity_labels.append(entity_labels)
        batch_head_labels.append(head_labels)
        batch_tail_labels.append(tail_labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    # batch_entity_labels: [btz, subject/object=2, num_entities, entity start/end]
    # batch_head_labels: [btz, num_relations, pairs of that relation, subject/object head positions]
    # batch_tail_labels: [btz, num_relations, pairs of that relation, subject/object tail positions]
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, seq_dims=2), dtype=torch.float, device=device)
    batch_head_labels = torch.tensor(sequence_padding(batch_head_labels, seq_dims=2), dtype=torch.float, device=device)
    batch_tail_labels = torch.tensor(sequence_padding(batch_tail_labels, seq_dims=2), dtype=torch.float, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]


train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)


# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path)
        self.entity_output = GlobalPointer(hidden_size=768, heads=2, head_size=64)
        self.head_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)
        self.tail_output = GlobalPointer(hidden_size=768, heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False)

    def forward(self, inputs):
        hidden_states = self.bert(inputs)  # [btz, seq_len, hdsz]
        mask = inputs[0].gt(0).long()
        entity_output = self.entity_output(hidden_states, mask)  # [btz, heads, seq_len, seq_len]
        head_output = self.head_output(hidden_states, mask)      # [btz, heads, seq_len, seq_len]
        tail_output = self.tail_output(hidden_states, mask)      # [btz, heads, seq_len, seq_len]
        return entity_output, head_output, tail_output


model = Model().to(device)


class MyLoss(SparseMultilabelCategoricalCrossentropy):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_preds, y_trues):
        '''y_preds: [Tensor], each of shape [btz, heads, seq_len, seq_len]
        '''
        loss_list = []
        for y_pred, y_true in zip(y_preds, y_trues):
            shape = y_pred.shape
            # multiply by seq_len because, after flattening to seq_len*seq_len, (i, j) maps to index i*seq_len+j
            y_true = y_true[..., 0] * shape[2] + y_true[..., 1]          # [btz, heads, flattened span indices]
            y_pred = y_pred.reshape(shape[0], -1, np.prod(shape[2:]))    # [btz, heads, seq_len*seq_len]
            loss = super().forward(y_pred, y_true.long())
            loss = torch.mean(torch.sum(loss, dim=1))
            loss_list.append(loss)
        return {'loss': sum(loss_list) / 3, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}


model.compile(loss=MyLoss(mask_zero=True), optimizer=optim.Adam(model.parameters(), 1e-5),
              metrics=['entity_loss', 'head_loss', 'tail_loss'])


def extract_spoes(text, threshold=0):
    """Extract the triples contained in the input text."""
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
    outputs = model.predict([token_ids, segment_ids])
    outputs = [o[0].cpu().numpy() for o in outputs]  # [heads, seq_len, seq_len]

    # Extract subjects and objects
    subjects, objects = set(), set()
    outputs[0][:, [0, -1]] -= float('inf')
    outputs[0][:, :, [0, -1]] -= float('inf')
    for l, h, t in zip(*np.where(outputs[0] > threshold)):
        if l == 0:
            subjects.add((h, t))
        else:
            objects.add((h, t))

    # Identify the corresponding predicate
    spoes = set()
    for sh, st in subjects:
        for oh, ot in objects:
            p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
            p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
            ps = set(p1s) & set(p2s)
            for p in ps:
                spoes.add((
                    text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[p],
                    text[mapping[oh][0]:mapping[ot][-1] + 1]
                ))
    return list(spoes)


class SPO(tuple):
    """Class for storing triples.
    It behaves like a tuple, but overrides __hash__ and __eq__ so that
    judging whether two triples are equivalent is more tolerant.
    """
    def __init__(self, spo):
        self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox


def evaluate(data):
    """Evaluation: compute f1, precision and recall."""
    X, Y, Z = 0, 1e-10, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        R = set([SPO(spo) for spo in extract_spoes(d['text'])])
        T = set([SPO(spo) for spo in d['spo_list']])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall))
        s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
                        'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall


class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        # optimizer.apply_ema_weights()
        f1, precision, recall = evaluate(valid_dataset.data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        # optimizer.reset_old_weights()
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
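Note: MyLoss above relies on flattening each (start, end) pair into a single index over the seq_len*seq_len score matrix before calling the sparse multilabel loss. A small sketch with made-up spans (seq_len=6, hypothetical values) shows why the multiplication by seq_len is needed:

import torch

seq_len = 6
# hypothetical gold spans (start, end) for one head
spans = torch.tensor([[1, 3], [2, 5]])

# flatten (i, j) -> i * seq_len + j, matching y_pred.reshape(..., seq_len * seq_len)
flat_idx = spans[:, 0] * seq_len + spans[:, 1]   # tensor([ 9, 17])

# round-trip back to (i, j) to check the mapping
recovered = torch.stack([flat_idx // seq_len, flat_idx % seq_len], dim=-1)
assert torch.equal(recovered, spans)
print(flat_idx.tolist())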
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_tplinker.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task, tplinker; with the 'cat' shaking type the entity part converges quickly
# while the relation part converges more slowly
# Official repo: https://github.com/131250208/TPlinker-joint-extraction
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.layers import TplinkerHandshakingKernel
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim

maxlen = 50
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the predicate label dict
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data.
        Single sample format: {'text': text, 'spo_list': [(s, p, o)]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                D.append({'text': l['text'],
                          'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
        return D


def trans_ij2k(seq_len, i, j):
    '''Map row i, column j to its index in the flattened upper triangle.'''
    if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
        return 0
    return int(0.5 * (2 * seq_len - i + 1) * i + (j - i))


map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}


def search(pattern, sequence):
    """Find the sub-list pattern in sequence.
    Return the first index if found, otherwise -1.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


def collate_fn(batch):
    pair_len = maxlen * (maxlen + 1) // 2
    # batch_entity_labels: [btz, pair_len]
    # batch_head_labels: [btz, rel_size, pair_len]
    # batch_tail_labels: [btz, rel_size, pair_len]
    batch_entity_labels = torch.zeros((len(batch), pair_len), dtype=torch.long, device=device)
    batch_head_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)
    batch_tail_labels = torch.zeros((len(batch), len(predicate2id), pair_len), dtype=torch.long, device=device)
    batch_token_ids = []
    for i, d in enumerate(batch):
        token_ids = tokenizer.encode(d['text'])[0][1:-1][:maxlen]  # truncate to the first maxlen tokens
        batch_token_ids.append(token_ids)
        # Collect the triples as {s: [(o, p)]}
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            p = predicate2id[p]
            o = tokenizer.encode(o)[0][1:-1]
            sh = search(s, token_ids)  # spans beyond maxlen will not be found
            oh = search(o, token_ids)
            if sh != -1 and oh != -1:
                st, ot = sh + len(s) - 1, oh + len(o) - 1
                batch_entity_labels[i, map_ij2k[sh, st]] = 1
                batch_entity_labels[i, map_ij2k[oh, ot]] = 1
                if sh <= oh:
                    batch_head_labels[i, p, map_ij2k[sh, oh]] = 1
                else:
                    batch_head_labels[i, p, map_ij2k[oh, sh]] = 2
                if st <= ot:
                    batch_tail_labels[i, p, map_ij2k[st, ot]] = 1
                else:
                    batch_tail_labels[i, p, map_ij2k[ot, st]] = 2
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
    return [batch_token_ids], [batch_entity_labels, batch_head_labels, batch_tail_labels]


train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)


# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0)
        self.combine_fc = nn.Linear(768 * 2, 768)
        self.ent_fc = nn.Linear(768, 2)
        self.head_rel_fc = nn.Linear(768, len(predicate2id) * 3)
        self.tail_rel_fc = nn.Linear(768, len(predicate2id) * 3)
        self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cat')

    def forward(self, inputs):
        last_hidden_state = self.bert(inputs)  # [btz, seq_len, hdsz]
        shaking_hiddens = self.handshaking_kernel(last_hidden_state)  # [btz, pair_len, hdsz]
        ent_shaking_outputs = self.ent_fc(shaking_hiddens)  # [btz, pair_len, 2]
        btz, pair_len = shaking_hiddens.shape[:2]
        head_rel_shaking_outputs = self.head_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3)  # [btz, predicate_num, pair_len, 3]
        tail_rel_shaking_outputs = self.tail_rel_fc(shaking_hiddens).reshape(btz, -1, pair_len, 3)
        return ent_shaking_outputs, head_rel_shaking_outputs, tail_rel_shaking_outputs


model = Model().to(device)


class MyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_preds, y_trues):
        loss_list = []
        for y_pred, y_true in zip(y_preds, y_trues):
            loss = super().forward(y_pred.view(-1, y_pred.size()[-1]), y_true.view(-1))
            loss_list.append(loss)
        z = (2 * len(predicate2id) + 1)
        total_steps = 6000  # weight entity recognition more heavily early on; total_steps could also be set to model.total_steps
        w_ent = max(1 / z + 1 - model.global_step / total_steps, 1 / z)
        w_rel = min((len(predicate2id) / z) * model.global_step / total_steps, (len(predicate2id) / z))
        loss = w_ent * loss_list[0] + w_rel * loss_list[1] + w_rel * loss_list[2]
        return {'loss': loss, 'entity_loss': loss_list[0], 'head_loss': loss_list[1], 'tail_loss': loss_list[2]}


model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), 5e-5),
              metrics=['entity_loss', 'head_loss', 'tail_loss'])


def extract_spoes(text):
    """Extract the triples contained in the input text."""
    def get_spots_fr_shaking_tag(shaking_tag):
        '''Parse the relation tags.'''
        spots = []
        for shaking_inds in shaking_tag.nonzero():
            rel_id = shaking_inds[0].item()
            tag_id = shaking_tag[rel_id][shaking_inds[1]].item()
            matrix_inds = map_k2ij[shaking_inds[1].item()]
            # make sure the subject comes first and the object second
            if tag_id == 1:
                spot = (rel_id, matrix_inds[0], matrix_inds[1])
            elif tag_id == 2:
                spot = (rel_id, matrix_inds[1], matrix_inds[0])
            spots.append(spot)
        return spots

    tokens = tokenizer.tokenize(text)[1:-1]
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.encode(text)[0][1:-1]
    token_ids_ts = torch.tensor(sequence_padding([token_ids], length=maxlen), dtype=torch.long, device=device)
    outputs = model.predict([token_ids_ts])
    outputs = [o[0].argmax(dim=-1) for o in outputs]

    # Extract entities
    ent_matrix_spots = set()
    ent_text = set()
    for shaking_ind in outputs[0].nonzero():
        shaking_ind_ = shaking_ind[0].item()
        # tag_id = outputs[0][shaking_ind_]
        matrix_inds = map_k2ij[shaking_ind_]
        spot = (matrix_inds[0], matrix_inds[1])
        if (spot[0] < len(mapping)) and (spot[1] < len(mapping)):  # entity span lies within the mapping
            ent_matrix_spots.add(spot)
            ent_text.add(text[mapping[spot[0]][0]:mapping[spot[1]][-1] + 1])

    # Identify the corresponding predicate
    head_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[1])
    tail_rel_matrix_spots = get_spots_fr_shaking_tag(outputs[2])
    spoes = []
    for rel_h, sh, oh in head_rel_matrix_spots:
        for rel_t, st, ot in tail_rel_matrix_spots:
            # same relation, and both (sh, st) and (oh, ot) are in ent_matrix_spots
            if (rel_h == rel_t) and ((sh, st) in ent_matrix_spots) and ((oh, ot) in ent_matrix_spots):
                spoes.append((text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[rel_h],
                              text[mapping[oh][0]:mapping[ot][-1] + 1]))
    return spoes, token_ids, ent_text


class SPO(tuple):
    """Class for storing triples.
    It behaves like a tuple, but overrides __hash__ and __eq__ so that
    judging whether two triples are equivalent is more tolerant.
    """
    def __init__(self, spo):
        self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox


def evaluate(data):
    """Evaluation: compute f1, precision and recall."""
    X, Y, Z = 0, 1e-10, 1e-10
    E1, E2 = 0, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        spoes, token_ids, ent_text_pred = extract_spoes(d['text'])
        # spo_list is pruned according to maxlen
        spo_list = []
        for s, p, o in d['spo_list']:
            s_ = tokenizer.encode(s)[0][1:-1]
            o_ = tokenizer.encode(o)[0][1:-1]
            sh = search(s_, token_ids)  # spans beyond maxlen will not be found
            oh = search(o_, token_ids)
            if sh != -1 and oh != -1:
                spo_list.append((s, p, o))
        # f1 over the triples
        R = set([SPO(spo) for spo in spoes])
        T = set([SPO(spo) for spo in spo_list])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        # entity-level metric
        ent_text_truth = set([spo[0] for spo in spo_list] + [spo[-1] for spo in spo_list])
        E1 += len(ent_text_pred & ent_text_truth)
        E2 += len(ent_text_truth)
        E_acc = E1 / E2
        # accuracy of the entity_matrix, head_matrix and tail_matrix
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f, ent_acc: %.5f' % (f1, precision, recall, E_acc))
        s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
                        'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall


class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataset.data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
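Note: the handshaking labels above index into a flattened upper triangle via trans_ij2k. A quick round-trip check on a toy sequence length (values chosen only for illustration) confirms that map_ij2k and map_k2ij are inverses on the upper triangle:

def trans_ij2k(seq_len, i, j):
    # index of (i, j) in the row-major flattened upper triangle (i <= j)
    if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
        return 0
    return int(0.5 * (2 * seq_len - i + 1) * i + (j - i))

seq_len = 5
map_ij2k = {(i, j): trans_ij2k(seq_len, i, j) for i in range(seq_len) for j in range(seq_len) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}

# the upper triangle of a 5x5 matrix has 5*6/2 = 15 cells, indexed 0..14
assert sorted(map_ij2k.values()) == list(range(seq_len * (seq_len + 1) // 2))
assert all(map_k2ij[k] == ij for ij, k in map_ij2k.items())
print(map_ij2k[(0, 0)], map_ij2k[(0, 4)], map_ij2k[(4, 4)])  # 0 4 14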
bert/bert4torch_cmcc/examples/relation_extraction/task_relation_extraction_tplinker_plus.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Triple (SPO) extraction task, tplinker_plus
# Official repo: https://github.com/131250208/TPlinker-joint-extraction
# Dataset: http://ai.baidu.com/broad/download?dataset=sked
import json
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import TplinkerHandshakingKernel
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np

maxlen = 50
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the predicate label dict
predicate2id, id2predicate = {}, {}
with open('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/all_50_schemas', encoding='utf-8') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load data.
        Single sample format: {'text': text, 'spo_list': [(s, p, o)]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                D.append({'text': l['text'],
                          'spo_list': [(spo['subject'], spo['predicate'], spo['object']) for spo in l['spo_list']]})
        return D


def trans_ij2k(seq_len, i, j):
    '''Map row i, column j to its index in the flattened upper triangle.'''
    if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
        return 0
    return int(0.5 * (2 * seq_len - i + 1) * i + (j - i))


map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}


def tran_ent_rel2id():
    '''Build the tag mapping for the final classification layer.'''
    tag2id = {'ent': 0}
    for p in predicate2id.keys():
        for mode in ['sh_oh', 'oh_sh', 'st_ot', 'ot_st']:
            tag2id[p + '##' + mode] = len(tag2id)
    return tag2id


tag2id = tran_ent_rel2id()
id2tag = {v: k for k, v in tag2id.items()}


def search(pattern, sequence):
    """Find the sub-list pattern in sequence.
    Return the first index if found, otherwise -1.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


def collate_fn(batch):
    pair_len = maxlen * (maxlen + 1) // 2
    # batch_labels: [btz, pair_len, tag2id_len]
    batch_labels = torch.zeros((len(batch), pair_len, len(tag2id)), dtype=torch.long, device=device)
    batch_token_ids = []
    for i, d in enumerate(batch):
        token_ids = tokenizer.encode(d['text'])[0][1:-1][:maxlen]  # truncate to the first maxlen tokens
        batch_token_ids.append(token_ids)
        # Collect the triples as {s: [(o, p)]}
        for s, p, o in d['spo_list']:
            s = tokenizer.encode(s)[0][1:-1]
            o = tokenizer.encode(o)[0][1:-1]
            sh = search(s, token_ids)
            oh = search(o, token_ids)
            if sh != -1 and oh != -1:
                st, ot = sh + len(s) - 1, oh + len(o) - 1
                batch_labels[i, map_ij2k[sh, st], tag2id['ent']] = 1
                batch_labels[i, map_ij2k[oh, ot], tag2id['ent']] = 1
                if sh <= oh:
                    batch_labels[i, map_ij2k[sh, oh], tag2id[p + '##sh_oh']] = 1
                else:
                    batch_labels[i, map_ij2k[oh, sh], tag2id[p + '##oh_sh']] = 1
                if st <= ot:
                    batch_labels[i, map_ij2k[st, ot], tag2id[p + '##st_ot']] = 1
                else:
                    batch_labels[i, map_ij2k[ot, st], tag2id[p + '##ot_st']] = 1
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
    return [batch_token_ids], batch_labels


train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/train_data.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/relation_extraction/BD_Knowledge_Extraction/dev_data.json')
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)


# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path, segment_vocab_size=0)
        self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cln_plus', inner_enc_type='lstm')
        self.fc = nn.Linear(768, len(tag2id))

    def forward(self, inputs):
        last_hidden_state = self.bert(inputs)  # [btz, seq_len, hdsz]
        shaking_hiddens = self.handshaking_kernel(last_hidden_state)
        output = self.fc(shaking_hiddens)  # [btz, pair_len, tag_size]
        return output


model = Model().to(device)
model.compile(loss=MultilabelCategoricalCrossentropy(), optimizer=optim.Adam(model.parameters(), 5e-5))


def extract_spoes(text, threshold=0):
    """Extract the triples contained in the input text."""
    tokens = tokenizer.tokenize(text)[1:-1]
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.encode(text)[0][1:-1]
    token_ids_ = torch.tensor(sequence_padding([token_ids], length=maxlen), dtype=torch.long, device=device)
    outputs = model.predict([token_ids_])[0].cpu().numpy()  # [pair_len, tag_size]

    # Extract entities and identify the corresponding predicate
    ent_matrix_spots, ent_text = set(), set()
    head_rel_matrix_spots, tail_rel_matrix_spots = [], []
    for shaking_ind, tag_id in zip(*np.where(outputs > threshold)):
        matrix_inds = map_k2ij[shaking_ind]
        spot = (matrix_inds[0], matrix_inds[1])
        if (spot[0] < len(mapping)) and (spot[1] < len(mapping)):  # entity span lies within the mapping
            p = id2tag[tag_id].split('##')[0]
            if id2tag[tag_id] == 'ent':
                ent_matrix_spots.add(spot)
                ent_text.add(text[mapping[spot[0]][0]:mapping[spot[1]][-1] + 1])
            else:
                p = predicate2id[p]
                if id2tag[tag_id].endswith('##sh_oh'):
                    head_rel_matrix_spots.append((p, spot[0], spot[1]))
                elif id2tag[tag_id].endswith('##oh_sh'):
                    head_rel_matrix_spots.append((p, spot[1], spot[0]))
                elif id2tag[tag_id].endswith('##st_ot'):
                    tail_rel_matrix_spots.append((p, spot[0], spot[1]))
                elif id2tag[tag_id].endswith('##ot_st'):
                    tail_rel_matrix_spots.append((p, spot[1], spot[0]))

    spoes = []
    for rel_h, sh, oh in head_rel_matrix_spots:
        for rel_t, st, ot in tail_rel_matrix_spots:
            # same relation, and both (sh, st) and (oh, ot) are in ent_matrix_spots
            if (rel_h == rel_t) and ((sh, st) in ent_matrix_spots) and ((oh, ot) in ent_matrix_spots):
                spoes.append((text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[rel_h],
                              text[mapping[oh][0]:mapping[ot][-1] + 1]))
    return spoes, token_ids, ent_text


class SPO(tuple):
    """Class for storing triples.
    It behaves like a tuple, but overrides __hash__ and __eq__ so that
    judging whether two triples are equivalent is more tolerant.
    """
    def __init__(self, spo):
        self.spox = (tuple(tokenizer.tokenize(spo[0])), spo[1], tuple(tokenizer.tokenize(spo[2])))

    def __hash__(self):
        return self.spox.__hash__()

    def __eq__(self, spo):
        return self.spox == spo.spox


def evaluate(data):
    """Evaluation: compute f1, precision and recall."""
    X, Y, Z = 0, 1e-10, 1e-10
    E1, E2 = 0, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        spoes, token_ids, ent_text_pred = extract_spoes(d['text'])
        # spo_list is pruned according to maxlen
        spo_list = []
        for s, p, o in d['spo_list']:
            s_ = tokenizer.encode(s)[0][1:-1]
            o_ = tokenizer.encode(o)[0][1:-1]
            sh = search(s_, token_ids)  # spans beyond maxlen will not be found
            oh = search(o_, token_ids)
            if sh != -1 and oh != -1:
                spo_list.append((s, p, o))
        # f1 over the triples
        R = set([SPO(spo) for spo in spoes])
        T = set([SPO(spo) for spo in spo_list])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        # entity-level metric
        ent_text_truth = set([spo[0] for spo in spo_list] + [spo[-1] for spo in spo_list])
        E1 += len(ent_text_pred & ent_text_truth)
        E2 += len(ent_text_truth)
        E_acc = E1 / E2
        # accuracy of the entity_matrix, head_matrix and tail_matrix
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f, ent_acc: %.5f' % (f1, precision, recall, E_acc))
        s = json.dumps({'text': d['text'], 'spo_list': list(T), 'spo_list_pred': list(R),
                        'new': list(R - T), 'lack': list(T - R)}, ensure_ascii=False, indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall


class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataset.data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print('f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % (f1, precision, recall, self.best_val_f1))


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=None, epochs=20, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
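Note: tran_ent_rel2id above gives the single classification head one shared 'ent' tag plus four directed link tags (sh_oh, oh_sh, st_ot, ot_st) per predicate. A toy version with a hypothetical two-predicate schema (not the real 50-schema set) shows the resulting label-space size:

def tran_ent_rel2id(predicates):
    # one shared entity tag plus four directed link tags per predicate
    tag2id = {'ent': 0}
    for p in predicates:
        for mode in ['sh_oh', 'oh_sh', 'st_ot', 'ot_st']:
            tag2id[p + '##' + mode] = len(tag2id)
    return tag2id

# hypothetical predicates for illustration only
tag2id = tran_ent_rel2id(['founder', 'capital'])
assert len(tag2id) == 1 + 4 * 2
print(tag2id)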
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/README.md (new file, mode 100644)
# Sohu entity-based sentiment classification
- Competition link: https://www.biendata.xyz/competition/sohu_2022/

| Solution | Link | Metric |
| ---- | ---- | ---- |
| Top1 | [Zhihu](https://zhuanlan.zhihu.com/p/533808475) | preliminary round f1=0.7253, final round f1=0.8173 |
| baseline | —— | preliminary round f1=0.6737 |

# bert4torch reproduction
- The pretrained model is xlnet
- Since the competition has ended, results can no longer be submitted; the reproduction only compares on the offline dev set
- dev is the first 2000 samples; the last 10% used as dev in the original solution is not used, so the dev metric is slightly unstable

| Reproduction | Setup | Metric |
| ---- | ---- | ---- |
| Top1_github | first 2000 as dev, no swa, with warmup, no label_smoothing, no fgm, gradient accumulation=3, no rdrop | Epoch 5/10: f1=0.7697 |
| Top1_bert4torch reproduction 1 | same settings as above | Epoch 8/10: f1=0.7556 |
| Top1_bert4torch reproduction 2 | same settings as above + fgm + swa | Epoch 5/10: f1=0.7877 |

| Epoch | Top1_github | Top1_bert4torch reproduction 1 | Top1_bert4torch reproduction 2 |
| ---- | ---- | ---- | ---- |
| 1 | 0.728 | 0.7039 | 0.0274 |
| 2 | 0.7198 | 0.7327 | 0.7180 |
| 3 | 0.747 | 0.7531 | 0.7453 |
| 4 | 0.7625 | 0.7466 | 0.7594 |
| 5 | **0.7697** | 0.7464 | **0.7877** |
| 6 | 0.7638 | 0.7272 | 0.7726 |
| 7 | 0.7415 | 0.7471 | 0.7804 |
| 8 | 0.7593 | **0.7556** | 0.7829 |
| 9 | 0.7477 | 0.7455 | 0.7697 |
| 10 | 0.7466 | 0.7471 | 0.7620 |
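Note: the reproductions toggle fgm and swa, and the baseline below enables FGM via adversarial_train={'name': 'fgm'}. For reference, a minimal generic FGM (fast gradient method) adversarial-training step in plain PyTorch looks roughly like the sketch below; this only illustrates the idea and is not the bert4torch implementation (the epsilon value and embedding-name filter are assumptions):

import torch

def fgm_attack(model, epsilon=1.0, emb_name='word_embeddings'):
    # perturb embedding weights along the gradient direction; return a backup for later restore
    backup = {}
    for name, param in model.named_parameters():
        if param.requires_grad and emb_name in name and param.grad is not None:
            backup[name] = param.data.clone()
            norm = torch.norm(param.grad)
            if norm != 0 and not torch.isnan(norm):
                param.data.add_(epsilon * param.grad / norm)
    return backup

def fgm_restore(model, backup):
    # put the original embedding weights back after the adversarial backward pass
    for name, param in model.named_parameters():
        if name in backup:
            param.data = backup[name]

# usage inside one training step (sketch):
#   loss = criterion(model(x), y); loss.backward()
#   backup = fgm_attack(model)
#   loss_adv = criterion(model(x), y); loss_adv.backward()   # gradients accumulate
#   fgm_restore(model, backup)
#   optimizer.step(); optimizer.zero_grad()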
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/baseline/data_processing.ipynb (new file, mode 100644)
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1883\n",
"样本总量: 89195\n",
"================================样本0, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 1, \"content\": \"3.新疆棉是全球业界公认的高品质天然纤维原料,较好满足了全球范围内对棉制纺织品服装的刚性消费需求,是中国乃至全球纺织工业健康可持续发展的重要原料保障。近年来,新疆地区不仅棉花种植生产保持稳定,棉纺织及服装产业也迅速发展,为促进地区经济发展、解决各族人民就业、改善民生福祉发挥了重要作用。新疆棉花种植和棉纺产业是全球纺织供应链的重要组成部分,2021年,新疆棉产量512.9万吨,约占全球棉花产量的20%,美国政府打压新疆棉花及其制品的行为,势必严重危害全球纺织产业供应链的安全,损害全球数千万产业工人的切身利益,对此我们表示强烈反对。4.2021年1月,新疆纺织行业协会发布了详实、客观的《新疆棉纺织行业社会责任报告》,报告以详实的数据和资料充分说明中国新疆维吾尔自治区不存在所谓的“强迫劳动”。我们建议全球纺织业界各相关利益方查阅《报告》的内容和观点,尊重从事实出发的价值观,拒绝虚伪的政治操作,反对恶意造谣。我们欢迎包括美国同业在内的国际品牌、机构实地走访考察新疆棉花产区、纺织服装工厂,独立了解、判断相关事实。我们愿为相关考察和贸易投资合作提供便利与协助。\", \"entity\": {\"美国\": 0, \"中国\": 0}}\\n']\n",
"================================样本1, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 22269, \"content\": \"新华社北京8月27日电美国疾病控制和预防中心日前发布的一项研究结果显示,新冠变异病毒德尔塔毒株成为主要流行毒株后,在美获批的疫苗整体有效性降低约三分之一。研究人员分析了抗疫一线工作人员从2020年12月14日开始的疫苗接种和新冠感染情况。美国多个州的数千名抗疫一线工作人员参加了这项研究,他们每周接受核酸检测。在德尔塔毒株成为主要流行毒株期间,488名没有接种疫苗者中有19人感染,其中有症状感染者的比例为94.7%;2352名完全接种疫苗者中有24人感染,其中有症状感染者的比例为75%。现有研究没有包含感染后的病情严重程度。研究人员分析各种因素后认为,在德尔塔毒株成为主要流行毒株后,美国辉瑞、莫德纳和强生疫苗的整体有效性为66%。而先前发布的数据显示,截至2021年4月10日,这些疫苗的整体有效性为91%。据媒体报道,研究人员计划进一步分析不同疫苗的有效性,以及接种疫苗者和未接种疫苗者被感染后的症状特征等。(完)\", \"entity\": {\"毒株\": 0, \"德尔塔\": 0}}\\n']\n",
"================================样本2, train: 66896 dev: 22299 dev_type2: 471\n",
"['{\"id\": 44594, \"content\": \"民航局2022年1月21日发布的熔断指令去年底,多班自美国飞往中国的航班推迟或取消,曾引起关注。中国外交部发言人赵立坚在去年12月就达美航空赴华航班中途返航一事回应表示,近日,多班自美国飞往中国的航班推迟或取消,美国航空在距离飞机起飞仅有数小时的情况下突然宣布取消航班,达美航空的航班甚至出现航程过半后返航情况,给中国籍乘客带来巨大损失。中国驻美使领馆积极向有关乘客提供协助,并第一时间向美国有关航司提出严正交涉,敦促其保障乘客正当权益。中国外交部发言人华春莹去年8月表示,众所周知,国际定期客运航班熔断/熔控措施是降低疫情跨境传播风险的重要举措,该措施对中外航空公司一视同仁,公平公开。在中美航线上,中国国内的国航、东航等航空公司都曾熔断过,对于没有触发熔断条件的航空公司,中方从未实施该措施,因此这次美方没有理由限制中国赴美航班客座率,美方做法非常不合理。为何熔断航班激增值得注意的是,早在去年8月,美国交通部就曾要求中国的航空公司在未来四周内,将部分中国赴美航班的客座率限制在40%,当时也是对于美联航被触发“熔断”措施的回应。\", \"entity\": {\"中国\": 0, \"航班\": 0}}\\n']\n",
"================================样本3, train: 66897 dev: 22298 dev_type2: 470\n",
"['{\"id\": 66896, \"content\": \"当地时间11月5日晚,在英国伦敦的“百万面具游行”(Million Mask March)活动过程中,抗议者与警方发生冲突,致8名警察受伤,十余名抗议者被捕。 据英国《卫报》5日报道,当天夜晚,数百名抗议者聚集在英国伦敦,参加一年一度的游行。在游行过程中,参与者抗议政府越权、收入不平等,以及最近新出台的新冠疫情限制措施。 报道称,部分抗议者在游行中与警方发生冲突。伦敦警察厅表示,在伦敦各处的示威活动中,共有12人因各种违法行为被拘捕,此外,已有8名警察在与抗议者的冲突中受伤。 伦敦警察厅还在社交平台发布声明称,“有部分人在议会广场上燃放烟花和爆竹,该行为非常危险。警方为防止民众受到伤害而进入人群。” 据此前报道,“百万面具游行”活动于2011年由一个匿名黑客论坛发起,旨在以游行示威的方式反对审查制度、腐败和战争。\", \"entity\": {\"伦敦\": 0, \"英国\": 0}}\\n']\n"
]
}
],
"source": [
"from sklearn.model_selection import StratifiedKFold\n",
"import json\n",
"with open('E:/Github/Sohu2022/Sohu2022_data/nlp_data/train.txt', 'r', encoding='utf-8') as f:\n",
" train_data = f.readlines()\n",
"tag2_index = []\n",
"for line in train_data:\n",
" line = json.loads(line)\n",
" if 2 in set(line['entity'].values()):\n",
" tag2_index.append(1)\n",
" else:\n",
" tag2_index.append(0)\n",
"print(sum(tag2_index))\n",
" \n",
"print('样本总量:', len(train_data))\n",
"file_id = 0\n",
"kfold = StratifiedKFold(n_splits=4).split(train_data, tag2_index)\n",
"for i, (train_idx, dev_idx) in enumerate(kfold):\n",
" train, dev = [train_data[i] for i in train_idx], [train_data[i] for i in dev_idx]\n",
" dev_type2 = [tag2_index[i] for i in dev_idx]\n",
" with open(f'E:/Github/Sohu2022/Sohu2022_data/nlp_data/dev_{file_id}.txt', 'w', encoding='utf-8') as f:\n",
" f.writelines(dev)\n",
" with open(f'E:/Github/Sohu2022/Sohu2022_data/nlp_data/train_{file_id}.txt', 'w', encoding='utf-8') as f:\n",
" f.writelines(train)\n",
" \n",
" print(f'================================样本{file_id}, train: ', len(train), 'dev: ', len(dev), 'dev_type2: ', sum(dev_type2))\n",
" print(dev[:1])\n",
" file_id += 1"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/baseline/task_sentiment_sohu.py (new file, mode 100644)
#! -*- coding:utf-8 -*-
# Sohu 2022 entity-level sentiment classification baseline, https://www.biendata.xyz/competition/sohu_2022/
# Approach: mean-pool the head/tail tokens of each entity's first occurrence in the sentence,
# fgm + multi_dropout + cv, f1=0.67176
import numpy as np
import random
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.losses import FocalLoss
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
import random
import os
import argparse
import pickle
import warnings
warnings.filterwarnings("ignore")

parser = argparse.ArgumentParser(description='交叉验证')
parser.add_argument('--fileid', default=0)
parser.add_argument('--gpuid', default=0)
args = parser.parse_args()
fileid = args.fileid
gpuid = args.gpuid

# Configuration
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'
prefix = f'_char_512_cv_{fileid}'
save_path = f'./output/section1{prefix}.txt'
save_path_dev = f'./output/dev{prefix}.txt'
ckpt_path = f'./ckpt/best_model{prefix}.pt'
device = f'cuda:{gpuid}' if torch.cuda.is_available() else 'cpu'
seed = 42

# Model settings
epochs = 10
steps_per_epoch = 1000
total_eval_step = None
maxlen = 512
batch_size = 7
batch_size_eval = 64
categories = [-2, -1, 0, 1, 2]
categories_count = {k + 1: 0 for k in range(len(categories))}

# Fix the random seed
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


# Load the dataset
class MyDataset(ListDataset):
    def __init__(self, file_path=None, data=None, mode='train'):
        self.mode = mode
        super().__init__(file_path, data)

    def load_data(self, filename):
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        with open(filename, encoding='utf-8') as f:
            for l in tqdm(f, desc=f'[Loading {self.mode} data]'):
                taskData = json.loads(l.strip())
                id = taskData['id']
                # split by maximum length and punctuation
                for t in text_segmentate(taskData['content'], maxlen - 2, seps, strips):
                    entitys = []
                    # train
                    if isinstance(taskData['entity'], dict):
                        for ent, label in taskData['entity'].items():
                            start = self.search(ent, t)
                            if start != -1:
                                label = categories.index(label) + 1  # +1 leaves 0 for padding
                                entitys.append((ent, start, start + len(ent) - 1, label))
                                categories_count[label] += 1
                    # test
                    elif isinstance(taskData['entity'], list):
                        for ent in taskData['entity']:
                            start = self.search(ent, t)
                            if start != -1:
                                entitys.append((ent, start, start + len(ent) - 1, 0))
                    if entitys:  # only keep segments that contain entities
                        D.append((id, t, *entitys))
        return D

    def search(self, pattern, sequence):
        """Find the substring pattern in sequence.
        Return the first index if found, otherwise -1.
        """
        n = len(pattern)
        for i in range(len(sequence)):
            if sequence[i:i + n] == pattern:
                return i
        return -1


# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def collate_fn(batch):
    batch_extra, batch_token_ids, batch_entity_ids, batch_entity_labels = [], [], [], []
    for d in batch:
        id, contents, entities = d[0], d[1], d[2:]
        tokens = tokenizer.tokenize(contents, maxlen=maxlen)[1:-1]
        tokens = ['[CLS]'] + [j for i in tokens for j in i] + ['[SEP]']  # convert to char-level tokens
        mapping = tokenizer.rematch(contents, tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        entity_ids, entity_labels, extra_map = [], [], {}
        for ent, start, end, label in entities:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                entity_ids.append([start, end])
                # # verify the span boundaries are correct
                # if ''.join(tokenizer.ids_to_tokens(token_ids[start:end+1])) != ent.lower():
                #     print(''.join(tokenizer.ids_to_tokens(token_ids[start:end+1])), ent)
                entity_labels.append(label)
                extra_map[(start, end)] = (ent, label)
        if not entity_ids:  # at least one label is required
            entity_ids.append([0, 0])  # pad with 0 if none
            entity_labels.append(0)
        batch_extra.append((id, extra_map))
        batch_token_ids.append(token_ids)
        batch_entity_ids.append(entity_ids)
        batch_entity_labels.append(entity_labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)      # [btz, num_entities, start/end]
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device)  # [btz, num_entities]
    return [batch_token_ids, batch_entity_ids, batch_extra], batch_entity_labels


# Build the dataloaders
train_dataloader = DataLoader(MyDataset(f'{data_dir}/train_{fileid}.txt'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(f'{data_dir}/dev_{fileid}.txt', mode='dev'), batch_size=batch_size_eval, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(f'{data_dir}/test.txt', mode='test'), batch_size=batch_size_eval, collate_fn=collate_fn)


# Model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.dropout = [nn.Dropout(0.1), nn.Dropout(0.3), nn.Dropout(0.5), nn.Dropout(0.7)]
        self.dense = nn.Linear(768, 5 + 1)  # includes the padding class

    def forward(self, inputs):
        token_ids, entity_ids = inputs[0], inputs[1]
        last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
        btz, entity_count, _ = entity_ids.shape
        hidden_size = last_hidden_state.shape[-1]
        entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
        entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
        entity_states = torch.mean(entity_states, dim=2)  # mean of the entity head/tail hidden states
        entity_logits = []
        for dropout in self.dropout:
            entity_logits.append(self.dense(dropout(entity_states)))
        return entity_logits


model = Model().to(device)
print(categories_count)


class Loss(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.loss_fn = FocalLoss(ignore_index=0)

    def forward(self, entity_logits, labels):
        loss = 0
        for entity_logit in entity_logits:
            loss += self.loss_fn(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
        return loss


model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=1e-5), adversarial_train={'name': 'fgm'})


def evaluate(data):
    valid_true, valid_pred = [], []
    eval_step = 0
    result, result_prob = dict(), dict()
    for (token_ids, entity_ids, extra), entity_labels in tqdm(data):
        entity_logit = model.predict([token_ids, entity_ids])[0]  # [btz, num_entities, num_classes]
        entity_logit = F.softmax(entity_logit, dim=-1)
        entity_prob, entity_pred = torch.max(entity_logit, dim=-1)  # [btz, num_entities]
        # v_pred and v_true are the entity-level predictions; entity_tuple is a list of (smp_id, ent_id, start, end, label, prob)
        v_pred, entity_tuple = trans_entity2tuple(entity_ids, entity_pred, entity_prob)
        v_true, _ = trans_entity2tuple(entity_ids, entity_labels)
        valid_pred.extend(v_pred)
        valid_true.extend(v_true)
        # generate submit result
        for id_, ent_id_, start, end, label_, prob in entity_tuple:
            label_ = label_ - 3
            smp_id, s_e_ents = extra[id_][0], extra[id_][1]
            if (start, end) not in s_e_ents
:
raise
ValueError
(
'entity missing'
)
if
smp_id
not
in
result
:
result
[
smp_id
],
result_prob
[
smp_id
]
=
{},
{}
ent_name
=
s_e_ents
[(
start
,
end
)][
0
]
if
ent_name
in
result
[
smp_id
]
and
prob
<
result
[
smp_id
][
ent_name
][
-
1
]:
# 如果同一个实体
continue
else
:
result
[
smp_id
].
update
({
ent_name
:
(
label_
,
prob
)})
ent_prob
=
entity_logit
[
id_
][
ent_id_
].
cpu
().
numpy
()
result_prob
[
smp_id
].
update
({
ent_name
:
ent_prob
})
assert
prob
==
ent_prob
[
label_
+
3
]
eval_step
+=
1
if
(
total_eval_step
is
not
None
)
and
(
eval_step
>=
total_eval_step
):
break
valid_true
=
np
.
array
(
valid_true
)
valid_pred
=
np
.
array
(
valid_pred
)
f1
=
f1_score
(
valid_true
,
valid_pred
,
average
=
'macro'
)
acc
=
accuracy_score
(
valid_true
,
valid_pred
)
print
(
classification_report
(
valid_true
,
valid_pred
))
# 只保留label,不需要prob
for
k
,
v
in
result
.
items
():
result
[
k
]
=
{
i
:
j
[
0
]
for
i
,
j
in
v
.
items
()}
return
f1
,
acc
,
result
,
result_prob
def
trans_entity2tuple
(
entity_ids
,
entity_labels
,
entity_probs
=
None
):
'''把tensor转为(样本id, start, end, 实体类型, 实体概率值)的tuple用于计算指标
'''
y
,
ent_tuple
=
[],
[]
for
i
,
one_sample
in
enumerate
(
entity_ids
):
# 遍历样本
for
j
,
item
in
enumerate
(
one_sample
):
# 遍历实体
if
item
[
0
].
item
()
*
item
[
1
].
item
()
!=
0
:
tmp
=
(
i
,
j
,
item
[
0
].
item
(),
item
[
1
].
item
(),
entity_labels
[
i
,
j
].
item
())
y
.
append
(
entity_labels
[
i
,
j
].
item
())
ent_tuple
.
append
(
tmp
if
entity_probs
is
None
else
tmp
+
(
entity_probs
[
i
,
j
].
item
(),))
return
y
,
ent_tuple
class
Evaluator
(
Callback
):
"""评估与保存
"""
def
__init__
(
self
):
self
.
best_val_f1
=
0.
def
on_epoch_end
(
self
,
steps
,
epoch
,
logs
=
None
):
f1
,
acc
,
pred_result
,
pred_result_prob
=
evaluate
(
valid_dataloader
)
if
f1
>
self
.
best_val_f1
:
self
.
best_val_f1
=
f1
model
.
save_weights
(
ckpt_path
)
# save_result(pred_result, pred_result_prob, save_path=save_path_dev)
print
(
f
'[val-entity] f1:
{
f1
:.
5
f
}
, acc:
{
acc
:.
5
f
}
best_f1:
{
self
.
best_val_f1
:.
5
f
}
\n
'
)
def
save_result
(
result
,
result_prob
,
save_path
):
result
=
[(
key
,
value
)
for
key
,
value
in
result
.
items
()]
result
.
sort
(
key
=
lambda
x
:
x
[
0
])
result_str
=
'id
\t
result
\n
'
for
key
,
value
in
result
:
result_str
+=
f
'
{
key
}
\t
{
value
}
\n
'
with
open
(
save_path
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
result_str
)
# 保存概率
with
open
(
save_path
[:
-
4
]
+
'_prob.pkl'
,
'wb'
)
as
f
:
pickle
.
dump
(
result_prob
,
f
)
if
__name__
==
'__main__'
:
if
choice
==
'train'
:
evaluator
=
Evaluator
()
model
.
fit
(
train_dataloader
,
epochs
=
epochs
,
steps_per_epoch
=
steps_per_epoch
,
callbacks
=
[
evaluator
])
model
.
load_weights
(
ckpt_path
)
f1
,
acc
,
pred_result
,
pred_result_prob
=
evaluate
(
test_dataloader
)
save_result
(
pred_result
,
pred_result_prob
,
save_path
=
save_path
)
\ No newline at end of file
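The gather-and-average in `Model.forward` above packs entity start/end positions into one index tensor before pooling, which is easy to misread. Below is a minimal, self-contained sketch of the same indexing with toy tensors; all names and sizes are illustrative only, not taken from the repository.

import torch

# toy batch: 2 samples, seq_len=6, hidden=4; up to 3 entities per sample, each as [start, end]
last_hidden_state = torch.randn(2, 6, 4)
entity_ids = torch.tensor([[[1, 2], [3, 3], [0, 0]],
                           [[2, 4], [0, 0], [0, 0]]])        # [btz, n_ent, 2]; [0, 0] is padding

btz, n_ent, _ = entity_ids.shape
hidden = last_hidden_state.shape[-1]

# expand the start/end indices so they can index every hidden dimension
idx = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden)     # [btz, n_ent*2, hidden]
states = torch.gather(last_hidden_state, dim=1, index=idx)    # pick the start/end token states
states = states.reshape(btz, n_ent, 2, hidden).mean(dim=2)    # mean of start and end vectors

print(states.shape)  # torch.Size([2, 3, 4]) -> one pooled vector per entity span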
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/top1/training.py
#! -*- coding:utf-8 -*-
# Reproduction of the Top1 solution of the Sohu 2022 entity sentiment classification competition, https://www.biendata.xyz/competition/sohu_2022/
# Write-up: https://zhuanlan.zhihu.com/p/533808475
# Scheme: prompt-like concatenation [CLS]+sentence+[SEP]+ent1+[MASK]+ent2+[MASK]+[SEP]; classification is done at the [MASK] positions
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.tokenizers import Tokenizer, SpTokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import transformers
import random
from sklearn.metrics import f1_score, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

# configuration
pretrain_model = 'F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base'
config_path = pretrain_model + '/bert4torch_config.json'
checkpoint_path = pretrain_model + '/pytorch_model.bin'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'

prefix = f'_char_512'
save_path = f'./section1{prefix}.txt'
save_path_dev = f'./dev{prefix}.txt'
ckpt_path = f'./best_model{prefix}.pt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
use_swa = False
use_adv_train = False

# model settings
epochs = 10
steps_per_epoch = None
total_eval_step = None
num_warmup_steps = 4000
maxlen = 900
batch_size = 6
batch_size_eval = 64
grad_accumulation_steps = 3
categories = [-2, -1, 0, 1, 2]
mask_symbol = '<mask>'
seed_everything(19260817)  # fix the random seeds

# load the dataset
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in tqdm(f.readlines(), desc="Loading data"):
            taskData = json.loads(l.strip())
            text2 = ''.join([ent + mask_symbol for ent in taskData['entity'].keys()])
            D.append((taskData['content'], text2, taskData['entity']))
    return D

def search(tokens, search_token, start_idx=0):
    mask_idxs = []
    for i in range(len(tokens)):
        if tokens[i] == search_token:
            mask_idxs.append(i + start_idx)
    return mask_idxs

# build the tokenizer; here the one shipped with the transformers library is used
tokenizer = transformers.XLNetTokenizerFast.from_pretrained(pretrain_model)

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_entity_ids, batch_entity_labels = [], [], [], []
    for text, prompt, entity in batch:
        inputs = tokenizer.__call__(text=text, text_pair=prompt, add_special_tokens=True, max_length=maxlen, truncation="only_first")
        token_ids, segment_ids = inputs['input_ids'], inputs['token_type_ids']
        ent_ids = search(token_ids, tokenizer.mask_token_id)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_entity_ids.append(ent_ids)
        batch_entity_labels.append([categories.index(label) for label in entity.values()])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, value=-1), dtype=torch.long, device=device)  # [btz, num_entities]
    return [batch_token_ids, batch_segment_ids, batch_entity_ids], batch_entity_labels

# build the dataloaders
all_data = load_data(f'{data_dir}/train.txt')
random.shuffle(all_data)
split_index = 2000  # int(len(all_data)*0.9)
train_dataloader = DataLoader(ListDataset(data=all_data[split_index:]), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=all_data[:split_index]), batch_size=batch_size_eval, collate_fn=collate_fn)

# define the model structure on top of the backbone
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='xlnet')
        hidden_size = self.bert.configs['hidden_size']
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 5)
        )

    def forward(self, inputs):
        token_ids, segment_ids, entity_ids = inputs
        last_hidden_state = self.bert([token_ids, segment_ids])  # [btz, seq_len, hdsz]
        entity_ids = entity_ids.unsqueeze(2).repeat(1, 1, last_hidden_state.shape[-1])
        entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids)
        entity_logits = self.classifier(entity_states)
        return entity_logits

model = Model().to(device)

class Loss(nn.CrossEntropyLoss):
    def forward(self, entity_logit, labels):
        loss = super().forward(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
        return loss

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps=len(train_dataloader) * epochs, last_epoch=-1)
model.compile(loss=Loss(ignore_index=-1), optimizer=optimizer, scheduler=scheduler, clip_grad_norm=1.0,
              adversarial_train={'name': 'fgm' if use_adv_train else ''})

# swa
if use_swa:
    def average_function(ax: torch.Tensor, x: torch.Tensor, num: int) -> torch.Tensor:
        return ax + (x - ax) / (num + 1)
    swa_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=average_function)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, acc, pred_result = self.evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights(ckpt_path)
        print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
        if use_swa:
            swa_model.update_parameters(model)

    @staticmethod
    def evaluate(data):
        valid_true, valid_pred = [], []
        eval_step = 0
        result = dict()
        # the dataloader yields three inputs (token_ids, segment_ids, entity_ids), matching Model.forward
        for (token_ids, segment_ids, entity_ids), entity_labels in tqdm(data):
            if use_swa:
                swa_model.eval()
                with torch.no_grad():
                    entity_logit = F.softmax(swa_model([token_ids, segment_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
            else:
                entity_logit = F.softmax(model.predict([token_ids, segment_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
            _, entity_pred = torch.max(entity_logit, dim=-1)  # [btz, num_entities]

            # v_pred/v_true are the entity-level predictions
            valid_index = (entity_ids.flatten() > 0).nonzero().squeeze(-1)
            valid_pred.extend(entity_pred.flatten()[valid_index].cpu().tolist())
            valid_true.extend(entity_labels.flatten()[valid_index].cpu().tolist())

            eval_step += 1
            if (total_eval_step is not None) and (eval_step >= total_eval_step):
                break

        valid_true = np.array(valid_true)
        valid_pred = np.array(valid_pred)
        f1 = f1_score(valid_true, valid_pred, average='macro')
        acc = accuracy_score(valid_true, valid_pred)
        print(classification_report(valid_true, valid_pred))

        # keep only the label, drop the prob
        for k, v in result.items():
            result[k] = {i: j[0] for i, j in v.items()}
        return f1, acc, result

if __name__ == '__main__':
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, grad_accumulation_steps=grad_accumulation_steps, callbacks=[evaluator])

    model.load_weights(ckpt_path)
    f1, acc, pred_result = Evaluator.evaluate(valid_dataloader)
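As a reading aid for the prompt construction above, here is a minimal sketch (plain Python, toy strings) of how the second segment and the `<mask>` positions line up with the entities. The character-level "tokenization" below is a simplification assumed only for illustration; the script itself uses the XLNet tokenizer.

# toy example of the concatenation scheme: sentence + ent1<mask>ent2<mask>
entity = {'手机': 1, '相机': -1}            # entity -> sentiment label in [-2..2]
content = '这款手机不错,但是相机一般。'
mask_symbol = '<mask>'

text2 = ''.join(ent + mask_symbol for ent in entity)   # '手机<mask>相机<mask>'

# pretend tokenization: one token per character, '<mask>' kept as a single token
tokens = list(content) + ['[SEP]']
for ent in entity:
    tokens += list(ent) + [mask_symbol]

mask_positions = [i for i, t in enumerate(tokens) if t == mask_symbol]
labels = [{-2: 0, -1: 1, 0: 2, 1: 3, 2: 4}[v] for v in entity.values()]
print(mask_positions, labels)   # the i-th <mask> position is classified with the i-th entity's label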
bert/bert4torch_cmcc/examples/sentence_classfication/Sohu_2022_ABSA/top1/training_bert.py
#! -*- coding:utf-8 -*-
# Reproduction of the Top1 solution of the Sohu 2022 entity sentiment classification competition, https://www.biendata.xyz/competition/sohu_2022/
# Write-up: https://zhuanlan.zhihu.com/p/533808475
# Scheme: prompt-like concatenation [CLS]+sentence+[SEP]+ent1+[MASK]+ent2+[MASK]+[SEP]; classification is done at the [MASK] positions
import numpy as np
import json
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, seed_everything
from bert4torch.optimizers import get_linear_schedule_with_warmup
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

# configuration
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
data_dir = 'E:/Github/Sohu2022/Sohu2022_data/nlp_data'
choice = 'train'

prefix = f'_char_512'
save_path = f'./section1{prefix}.txt'
save_path_dev = f'./dev{prefix}.txt'
ckpt_path = f'./best_model{prefix}.pt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'
use_swa = True

# model settings
epochs = 10
steps_per_epoch = None
total_eval_step = None
num_warmup_steps = 4000
maxlen = 512
batch_size = 7
batch_size_eval = 64
categories = [-2, -1, 0, 1, 2]
seed_everything(42)  # fix the random seeds

# load the dataset
def load_data(filename):
    D = []
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    with open(filename, encoding='utf-8') as f:
        for l in tqdm(f.readlines(), desc="Loading data"):
            taskData = json.loads(l.strip())
            text2 = ''.join([ent + '[MASK]' for ent in taskData['entity'].keys()]) + '[SEP]'
            text2_len = sum([len(ent) + 1 for ent in taskData['entity'].keys()]) + 1
            for t in text_segmentate(taskData['content'], maxlen - text2_len - 2, seps, strips):
                D.append((t, text2, taskData['entity']))
    return D

def search(tokens, start_idx=0):
    mask_idxs = []
    for i in range(len(tokens)):
        if tokens[i] == '[MASK]':
            mask_idxs.append(i + start_idx)
    return mask_idxs

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_entity_ids, batch_entity_labels = [], [], []
    for text1, text2, entity in batch:
        token_ids1 = tokenizer.encode(text1)[0]
        tokens2 = tokenizer.tokenize(text2)[1:-1]
        token_ids2 = tokenizer.tokens_to_ids(tokens2)
        ent_ids_raw = search(tokens2, start_idx=len(token_ids1))

        # entities that do not appear in this text segment: their [MASK] positions are excluded from the loss
        ent_labels, ent_ids = [], []
        for i, (ent, label) in enumerate(entity.items()):
            if ent in text1:
                assert tokens2[ent_ids_raw[i] - len(token_ids1)] == '[MASK]'
                ent_ids.append(ent_ids_raw[i])
                ent_labels.append(categories.index(label))

        batch_token_ids.append(token_ids1 + token_ids2)
        batch_entity_ids.append(ent_ids)
        batch_entity_labels.append(ent_labels)

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels, value=-1), dtype=torch.long, device=device)  # [btz, num_entities]
    return [batch_token_ids, batch_entity_ids], batch_entity_labels

# build the dataloaders
all_data = load_data(f'{data_dir}/train.txt')
split_index = int(len(all_data) * 0.9)
train_dataloader = DataLoader(ListDataset(data=all_data[:split_index]), batch_size=batch_size, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=all_data[split_index:]), batch_size=batch_size_eval, collate_fn=collate_fn)

# define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        hidden_size = self.bert.configs['hidden_size']
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 5)
        )

    def forward(self, inputs):
        token_ids, entity_ids = inputs[0], inputs[1]
        last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
        hidden_size = last_hidden_state.shape[-1]
        entity_ids = entity_ids.unsqueeze(2).repeat(1, 1, hidden_size)
        entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids)
        entity_logits = self.classifier(entity_states)
        return entity_logits

model = Model().to(device)

class Loss(nn.CrossEntropyLoss):
    def forward(self, entity_logit, labels):
        loss = super().forward(entity_logit.reshape(-1, entity_logit.shape[-1]), labels.flatten())
        return loss

optimizer = optim.Adam(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps=len(train_dataloader) * epochs, last_epoch=-1)
model.compile(loss=Loss(ignore_index=-1), optimizer=optimizer, scheduler=scheduler, adversarial_train={'name': 'fgm'})

# swa
if use_swa:
    def average_function(ax: torch.Tensor, x: torch.Tensor, num: int) -> torch.Tensor:
        return ax + (x - ax) / (num + 1)
    swa_model = torch.optim.swa_utils.AveragedModel(model, avg_fn=average_function)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, acc, pred_result = self.evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights(ckpt_path)
        print(f'[val-entity] f1: {f1:.5f}, acc: {acc:.5f} best_f1: {self.best_val_f1:.5f}\n')
        if use_swa:
            swa_model.update_parameters(model)

    @staticmethod
    def evaluate(data):
        valid_true, valid_pred = [], []
        eval_step = 0
        result = dict()
        for (token_ids, entity_ids), entity_labels in tqdm(data):
            if use_swa:
                swa_model.eval()
                with torch.no_grad():
                    entity_logit = F.softmax(swa_model([token_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
            else:
                entity_logit = F.softmax(model.predict([token_ids, entity_ids]), dim=-1)  # [btz, num_entities, num_classes]
            _, entity_pred = torch.max(entity_logit, dim=-1)  # [btz, num_entities]

            # v_pred/v_true are the entity-level predictions
            valid_index = (entity_ids.flatten() > 0).nonzero().squeeze(-1)
            valid_pred.extend(entity_pred.flatten()[valid_index].cpu().tolist())
            valid_true.extend(entity_labels.flatten()[valid_index].cpu().tolist())

            eval_step += 1
            if (total_eval_step is not None) and (eval_step >= total_eval_step):
                break

        valid_true = np.array(valid_true)
        valid_pred = np.array(valid_pred)
        f1 = f1_score(valid_true, valid_pred, average='macro')
        acc = accuracy_score(valid_true, valid_pred)
        print(classification_report(valid_true, valid_pred))

        # keep only the label, drop the prob
        for k, v in result.items():
            result[k] = {i: j[0] for i, j in v.items()}
        return f1, acc, result

if __name__ == '__main__':
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])

    model.load_weights(ckpt_path)
    f1, acc, pred_result = Evaluator.evaluate(valid_dataloader)
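The custom `average_function` above is simply the running mean that `torch.optim.swa_utils.AveragedModel` applies parameter-wise each time `update_parameters` is called. A tiny standalone numeric sketch (toy scalars, not model weights) of why `ax + (x - ax) / (num + 1)` equals the plain mean of all snapshots seen so far:

# running-mean form used by the SWA averaging above:
#   new_avg = avg + (x - avg) / (n + 1)
# after k updates this equals the arithmetic mean of the k snapshots.
snapshots = [2.0, 4.0, 9.0]

avg, n = 0.0, 0
for x in snapshots:
    avg = avg + (x - avg) / (n + 1)
    n += 1

print(avg, sum(snapshots) / len(snapshots))   # 5.0 5.0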
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/README.md
# Tianchi News Classification
Competition link: https://tianchi.aliyun.com/competition/entrance/531810/introduction?lang=zh-cn

| Solution | Description | Metric |
| ---- | ---- | ---- |
| Top1 | [Github](https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION) | official-round f1=0.9735 |
| Top1 reproduction | BERT initialized from the fold-1 CV checkpoint at epoch 5, trained 1 epoch, ensemble of seed=0, 1993, 2020 | long-term-round f1=0.9736 |
| Top1 bert4torch reproduction | bert+attn+fgm+cv | long-term-round f1=0.9727, dev_5cv=(0.97083, 0.97074, 0.96914, 0.96892, 0.96613) |

## Files
- convert.py: converts the TensorFlow weights from the link above to PyTorch
- training.py: finetuning script
- inference.py: fold-by-fold inference, weighted by the dev metrics, producing the submission file
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/convert.py
import torch
import tensorflow as tf

tf_path = 'E:/Github/天池新闻分类/top1/pre_models/bert_model.ckpt'
torch_state_dict = {}

mapping = {
    'bert/embeddings/word_embeddings': 'bert.embeddings.word_embeddings.weight',
    'bert/embeddings/token_type_embeddings': 'bert.embeddings.token_type_embeddings.weight',
    'bert/embeddings/position_embeddings': 'bert.embeddings.position_embeddings.weight',
    'bert/embeddings/LayerNorm/beta': 'bert.embeddings.LayerNorm.bias',
    'bert/embeddings/LayerNorm/gamma': 'bert.embeddings.LayerNorm.weight',
    # 'bert/pooler/dense/kernel': 'bert.pooler.dense.weight',
    # 'bert/pooler/dense/bias': 'bert.pooler.dense.bias',
    # 'cls/seq_relationship/output_weights': 'cls.seq_relationship.weight',
    # 'cls/seq_relationship/output_bias': 'cls.seq_relationship.bias',
    'cls/predictions/transform/dense/kernel': 'cls.predictions.transform.dense.weight##T',
    'cls/predictions/transform/dense/bias': 'cls.predictions.transform.dense.bias',
    'cls/predictions/transform/LayerNorm/beta': 'cls.predictions.transform.LayerNorm.bias',
    'cls/predictions/transform/LayerNorm/gamma': 'cls.predictions.transform.LayerNorm.weight',
    'cls/predictions/output_bias': 'cls.predictions.bias',
}

for i in range(12):
    prefix = 'bert/encoder/layer_%d/' % i
    prefix_i = f'bert.encoder.layer.%d.' % i
    mapping.update({
        prefix + 'attention/self/query/kernel': prefix_i + 'attention.self.query.weight##T',
        prefix + 'attention/self/query/bias': prefix_i + 'attention.self.query.bias',
        prefix + 'attention/self/key/kernel': prefix_i + 'attention.self.key.weight##T',
        prefix + 'attention/self/key/bias': prefix_i + 'attention.self.key.bias',
        prefix + 'attention/self/value/kernel': prefix_i + 'attention.self.value.weight##T',
        prefix + 'attention/self/value/bias': prefix_i + 'attention.self.value.bias',
        prefix + 'attention/output/dense/kernel': prefix_i + 'attention.output.dense.weight##T',
        prefix + 'attention/output/dense/bias': prefix_i + 'attention.output.dense.bias',
        prefix + 'attention/output/LayerNorm/beta': prefix_i + 'attention.output.LayerNorm.bias',
        prefix + 'attention/output/LayerNorm/gamma': prefix_i + 'attention.output.LayerNorm.weight',
        prefix + 'intermediate/dense/kernel': prefix_i + 'intermediate.dense.weight##T',
        prefix + 'intermediate/dense/bias': prefix_i + 'intermediate.dense.bias',
        prefix + 'output/dense/kernel': prefix_i + 'output.dense.weight##T',
        prefix + 'output/dense/bias': prefix_i + 'output.dense.bias',
        prefix + 'output/LayerNorm/beta': prefix_i + 'output.LayerNorm.bias',
        prefix + 'output/LayerNorm/gamma': prefix_i + 'output.LayerNorm.weight',
    })

for old_key, new_key in mapping.items():
    try:
        ts = tf.train.load_variable(tf_path, old_key)
        if new_key.endswith('##T'):
            torch_state_dict[new_key.rstrip('##T')] = torch.from_numpy(ts).T
        else:
            torch_state_dict[new_key] = torch.from_numpy(ts)
    except:
        print('Missing ', old_key)

torch.save(torch_state_dict, 'E:/Github/天池新闻分类/top1/pre_models/pytorch_model.bin')
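The `##T` suffix in the mapping marks TensorFlow dense kernels that must be transposed: a TF `Dense` kernel is stored as `[in_features, out_features]`, while `torch.nn.Linear.weight` is `[out_features, in_features]`. A minimal standalone sketch of that convention with a toy array (no TF checkpoint needed):

import numpy as np
import torch

tf_kernel = np.arange(12, dtype='float32').reshape(4, 3)     # TF Dense kernel: [in=4, out=3]

linear = torch.nn.Linear(4, 3, bias=False)
with torch.no_grad():
    linear.weight.copy_(torch.from_numpy(tf_kernel).T)        # torch Linear.weight is [out, in]

x = torch.ones(1, 4)
print(torch.allclose(linear(x), torch.from_numpy(x.numpy() @ tf_kernel)))   # True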
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/inference.py
# Model inference script
# predict with each CV fold and weight the logits by the dev metrics
from training import Model, collate_fn
import torch
from torch.utils.data import DataLoader
from bert4torch.snippets import ListDataset
import pandas as pd
from tqdm import tqdm
import numpy as np

device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 16

def load_data(df):
    """Load the data."""
    D = list()
    for _, row in df.iterrows():
        text = row['text']
        D.append((text, 0))
    return D

df_test = pd.read_csv('E:/Github/天池新闻分类/data/test_a.csv', sep='\t')
df_test['text'] = df_test['text'].apply(lambda x: x.strip().split())
test_data = load_data(df_test)
# shuffle must stay off so that the prediction order matches the test rows in the submission
dev_dataloader = DataLoader(ListDataset(data=test_data), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

f1_score = [0.97083, 0.97074, 0.96914, 0.96892, 0.96613]
y_pred_final = 0
for i in range(5):
    model = Model().to(device)
    model.load_weights(f'best_model_fold{i+1}.pt')

    y_pred = []
    for x, _ in tqdm(dev_dataloader, desc=f'evaluate_cv{i}'):
        y_pred.append(model.predict(x).cpu().numpy())
        # if len(y_pred) > 10:
        #     break
    y_pred = np.concatenate(y_pred)
    y_pred_final += y_pred * f1_score[i]
    np.save(f'test_cv{i}_logit.npy', y_pred)

df_test = pd.DataFrame(y_pred_final.argmax(axis=1))
df_test.columns = ['label']
df_test.to_csv('submission.csv', index=False)
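A minimal numeric sketch of the ensembling rule used above (the weights and logits below are made up): each fold's logits are scaled by its dev f1 and summed before the final argmax, so better folds get slightly more say.

import numpy as np

# toy: 3 test samples, 14 classes, 2 folds
fold_logits = [np.random.rand(3, 14), np.random.rand(3, 14)]
fold_f1 = [0.971, 0.969]

y_final = sum(w * logits for w, logits in zip(fold_f1, fold_logits))
labels = y_final.argmax(axis=1)
print(labels.shape)   # (3,) -> one label per test row, in dataloader order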
bert/bert4torch_cmcc/examples/sentence_classfication/Tianchi_News_Classification/training.py
# Model training script
# Reference: https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION
# Only the finetuning part of the Top1 solution is implemented here with bert4torch; the original author's pretrained weights are converted to PyTorch and used directly
import numpy as np
import pandas as pd
from bert4torch.models import build_transformer_model, BaseModel
from torch.utils.data import DataLoader
from bert4torch.snippets import sequence_padding, ListDataset, Callback, EarlyStopping
from bert4torch.tokenizers import Tokenizer
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import torch
from torch import nn, optim
from tqdm import tqdm

# BERT base
config_path = 'E:/Github/天池新闻分类/top1/pre_models/bert_config.json'
checkpoint_path = 'E:/Github/天池新闻分类/top1/pre_models/pytorch_model.bin'
dict_path = 'E:/Github/天池新闻分类/top1/pre_models/vocab.txt'
device = f'cuda' if torch.cuda.is_available() else 'cpu'

n = 5  # Cross-validation
SEED = 2020
num_classes = 14
maxlen = 512
max_segment = 2
batch_size = 4
grad_accum_steps = 64
drop = 0.2
lr = 2e-5
epochs = 100

def load_data(df):
    """Load the data."""
    D = list()
    for _, row in df.iterrows():
        text = row['text']
        label = row['label']
        D.append((text, int(label)))
    return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def sentence_split(words):
    """Split a document into segments."""
    document_len = len(words)

    index = list(range(0, document_len, maxlen - 2))
    index.append(document_len)

    segments = []
    for i in range(len(index) - 1):
        segment = words[index[i]:index[i + 1]]
        assert len(segment) > 0
        segment = tokenizer.tokens_to_ids(['[CLS]'] + segment + ['[SEP]'])
        segments.append(segment)

    assert len(segments) > 0
    if len(segments) > max_segment:
        segment_ = int(max_segment / 2)
        return segments[:segment_] + segments[-segment_:]
    else:
        return segments

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text, label in batch:
        token_ids = sentence_split(text)
        token_ids = sequence_padding(token_ids, length=maxlen)
        batch_token_ids.append(token_ids)
        batch_labels.append(label)

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=max_segment), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, device=device)
    return batch_token_ids, batch_labels

class Attention(nn.Module):
    """Attention pooling layer."""
    def __init__(self, hidden_size, **kwargs):
        self.hidden_size = hidden_size
        super().__init__(**kwargs)
        self.weight = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(self.hidden_size))
        self.query = nn.Linear(self.hidden_size, 1, bias=False)

    def forward(self, x, mask):
        '''x: [btz, max_segment, hdsz]
           mask: [btz, max_segment, 1]
        '''
        mask = mask.squeeze(2)  # [btz, max_segment]
        # linear
        key = self.weight(x) + self.bias  # [btz, max_segment, hdsz]
        # compute attention
        outputs = self.query(key).squeeze(2)  # [btz, max_segment]
        outputs -= 1e32 * (1 - mask)
        attn_scores = F.softmax(outputs, dim=-1)
        attn_scores = attn_scores * mask
        attn_scores = attn_scores.reshape(-1, 1, attn_scores.shape[-1])  # [btz, 1, max_segment]
        outputs = torch.matmul(attn_scores, key).squeeze(1)  # [btz, hdsz]
        return outputs

# define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)
        self.attn = Attention(768)
        self.dense = nn.Linear(768, num_classes)

    def forward(self, token_ids):
        ''' token_ids: [btz, max_segment, max_len]
        '''
        input_mask = torch.any(token_ids, dim=-1, keepdim=True).long()  # [btz, max_segment, 1]
        token_ids = token_ids.reshape(-1, token_ids.shape[-1])  # [btz*max_segment, max_len]
        output = self.bert([token_ids])[:, 0]  # [btz*max_segment, hdsz]
        output = output.reshape((-1, max_segment, output.shape[-1]))  # [btz, max_segment, hdsz]
        output = output * input_mask
        output = self.dropout1(output)
        output = self.attn(output, input_mask)
        output = self.dropout2(output)
        output = self.dense(output)
        return output

class Evaluator(Callback):
    def __init__(self, model, dataloader, fold):
        super().__init__()
        self.model = model
        self.dataloader = dataloader
        self.best_val_f1 = 0.
        self.fold = fold

    def evaluate(self):
        y_true, y_pred = list(), list()
        for x, y in tqdm(self.dataloader, desc='evaluate'):
            y_true.append(y.cpu().numpy())
            y_pred.append(self.model.predict(x).argmax(axis=1).cpu().numpy())
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
        f1 = f1_score(y_true, y_pred, average='macro')
        return f1

    def on_epoch_end(self, steps, epoch, logs=None):
        val_f1 = self.evaluate()
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
            self.model.save_weights(f'best_model_fold{self.fold}.pt')
        logs['val_f1'] = val_f1  # must be set, otherwise EarlyStopping does not take effect
        print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}\n')

def do_train(df_train):
    skf = StratifiedKFold(n_splits=n, random_state=SEED, shuffle=True)
    for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train['text'], df_train['label']), 1):
        print(f'[Fold {fold}]')
        train_data = load_data(df_train.iloc[trn_idx])
        valid_data = load_data(df_train.iloc[val_idx])
        train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)

        model = Model().to(device)
        model.compile(
            loss=nn.CrossEntropyLoss(),
            optimizer=optim.Adam(model.parameters(), lr=lr),
            adversarial_train={'name': 'fgm'}
        )

        callbacks = [
            Evaluator(model, valid_dataloader, fold),
            EarlyStopping(monitor='val_f1', patience=5, verbose=1, mode='max'),  # must come after the Evaluator
        ]
        model.fit(train_dataloader, steps_per_epoch=None, epochs=epochs, grad_accumulation_steps=grad_accum_steps, callbacks=callbacks)
        del model

if __name__ == '__main__':
    df_train = pd.read_csv('E:/Github/天池新闻分类/data/train_set.csv', sep='\t')
    df_train['text'] = df_train['text'].apply(lambda x: x.strip().split())
    do_train(df_train)
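A small standalone sketch of the masking trick inside `Attention.forward` above: padded segments get a huge negative score before the softmax, so they receive (numerically) zero attention weight. The tensors below are toy values for illustration only.

import torch
import torch.nn.functional as F

scores = torch.tensor([[1.0, 2.0, 0.5]])        # [btz=1, max_segment=3] raw attention scores
mask = torch.tensor([[1.0, 1.0, 0.0]])          # the third segment is padding

scores = scores - 1e32 * (1 - mask)             # padded position -> effectively -inf
attn = F.softmax(scores, dim=-1) * mask         # weights distributed over real segments only
print(attn)                                     # tensor([[0.2689, 0.7311, 0.0000]])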
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentence_similarity_lcqmc.py
#! -*- coding:utf-8 -*-
# Sentence-pair classification on the LCQMC dataset
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tensorboardX import SummaryWriter

maxlen = 128
batch_size = 64
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir='./summary')  # prepare summary writer

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Each item has the format (text1, text2, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text1, text2, label = l.strip().split('\t')
                D.append((text1, text2, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text1, text2, label in batch:
        token_ids, segment_ids = tokenizer.encode(text1, text2, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return (batch_token_ids, batch_segment_ids), batch_labels.flatten()

# load the datasets
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_embedding/LCQMC/LCQMC.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        _, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# define the loss and optimizer; both can be customized
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_acc = 0.

    def on_batch_end(self, global_step, local_step, logs=None):
        if global_step % 10 == 0:
            writer.add_scalar(f"train/loss", logs['loss'], global_step)
            val_acc = evaluate(valid_dataloader)
            writer.add_scalar(f"valid/acc", val_acc, global_step)

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification.py
#! -*- coding:utf-8 -*-
# Sentiment classification task, loading BERT weights
# valid_acc: 94.72, test_acc: 94.11
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir='./summary')  # prepare summary writer
choice = 'train'  # 'train' to train, 'infer' to run inference

# fix the random seed
seed_everything(42)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split texts into pieces no longer than maxlen
        """
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        hidden_states, pooling = self.bert([token_ids, segment_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# define the loss and optimizer; both can be customized
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_acc = 0.

    # def on_batch_end(self, global_step, local_step, logs=None):
    #     if global_step % 10 == 0:
    #         writer.add_scalar(f"train/loss", logs['loss'], global_step)
    #         val_acc = evaluate(valid_dataloader)
    #         writer.add_scalar(f"valid/acc", val_acc, global_step)

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

def inference(texts):
    '''Inference on single samples
    '''
    for text in texts:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
        segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]
        logit = model.predict([token_ids, segment_ids])
        y_pred = torch.argmax(torch.softmax(logit, dim=-1)).cpu().numpy()
        print(text, ' ----> ', y_pred)

if __name__ == '__main__':
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
    else:
        model.load_weights('best_model.pt')
        inference(['我今天特别开心', '我今天特别生气'])
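`get_pool_emb` is bert4torch's helper for turning token states into one sentence vector; its exact signature is library-specific and not reproduced here. The two pooling choices these examples switch between ('cls' vs mean pooling over real tokens) boil down to the following plain-torch sketch, which is an illustration only and not the library's implementation.

import torch

hidden = torch.randn(2, 5, 8)                        # [btz, seq_len, hdsz]
mask = torch.tensor([[1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1]]).float()       # 1 = real token, 0 = padding

cls_emb = hidden[:, 0]                               # 'cls': take the first token's state

m = mask.unsqueeze(-1)                               # mean pooling: average over real tokens only
mean_emb = (hidden * m).sum(dim=1) / m.sum(dim=1)

print(cls_emb.shape, mean_emb.shape)                 # torch.Size([2, 8]) torch.Size([2, 8])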
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_GAU_alpha.py
#! -*- coding:utf-8 -*-
# Sentiment classification task, loading GAU-alpha weights
# Blog: https://kexue.fm/archives/9052
# Weight conversion script: ./convert_script/convert_GAU_alpha.py
# valid_acc: 95.25, test_acc: 94.46
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import random
import os
import numpy as np

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix the random seed
seed_everything(42)

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split texts into pieces no longer than maxlen
        """
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# define the model structure on top of the backbone
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='gau_alpha')
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        last_hidden_state = self.bert([token_ids, segment_ids])
        output = self.dropout(last_hidden_state[:, 0, :])
        output = self.dense(output)
        return output

model = Model().to(device)

# define the loss and optimizer; both can be customized
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_acc = 0.

    # def on_batch_end(self, global_step, local_step, logs=None):
    #     if global_step % 10 == 0:
    #         writer.add_scalar(f"train/loss", logs['loss'], global_step)
    #         val_acc = evaluate(valid_dataloader)
    #         writer.add_scalar(f"valid/acc", val_acc, global_step)

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        test_acc = evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_PET.py
#! -*- coding:utf-8 -*-
# Sentiment analysis example: using MLM for Zero-Shot/Few-Shot/Semi-Supervised Learning
# Reference project: https://github.com/bojone/Pattern-Exploiting-Training
# Metrics below; since the random seed is not fixed, they may fluctuate slightly
# zero-shot1: 0.8517/0.8437
# zero-shot2: 0.8811/0.8707
# few-shot: 0.8896/0.8910
# semi-sup: 0.9024/0.8948
import torch
import torch.nn as nn
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model
from torch.optim import Adam
import torch.nn.functional as F
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from torch.utils.data import DataLoader

num_classes = 2
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'semi-sup'  # zero-shot1, zero-shot2, few-shot, semi-sup

def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D

# load the datasets
train_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data')
valid_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data')
test_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data')

# simulate labeled and unlabeled data
train_frac = 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
if choice == 'zero-shot2':
    train_data = unlabeled_data  # continue MLM pretraining with unlabeled data only
elif choice == 'few-shot':
    train_data = train_data[:num_labeled]  # use only the small labeled set
elif choice == 'semi-sup':
    # semi-supervised: small labeled set plus all unlabeled data
    train_data = train_data[:num_labeled]
    train_data = train_data + unlabeled_data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# task description (pattern)
prefix = u'很满意。'
mask_idx = 1
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')

def random_masking(token_ids):
    """Randomly mask the input
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
            source.append(t)
            target.append(t)
        elif r < 0.15:
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
            target.append(t)
        else:
            source.append(t)
            target.append(0)
    return source, target

class MyDataset(ListDataset):
    def collate_fn(self, batch):
        batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
        for text, label in batch:
            if label != 2:
                text = prefix + text
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            if self.kwargs['random']:
                source_ids, target_ids = random_masking(token_ids)
            else:
                source_ids, target_ids = token_ids[:], token_ids[:]
            if label == 0:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = neg_id
            elif label == 1:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = pos_id
            batch_token_ids.append(source_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(target_ids)

        batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
        batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
        batch_output_ids = torch.tensor(sequence_padding(batch_output_ids), dtype=torch.long, device=device)
        return [batch_token_ids, batch_segment_ids], batch_output_ids

# build the dataloaders
train_dataset = MyDataset(data=train_data, random=True)
valid_dataset = MyDataset(data=valid_data, random=False)
test_dataset = MyDataset(data=test_data, random=False)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=valid_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)

# load the pretrained model
model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True).to(device)

class MyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_preds, y_true):
        y_pred = y_preds[1]
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        loss = super().forward(y_pred, y_true.flatten())
        return loss

# define the loss and optimizer; both can be customized
model.compile(
    loss=MyLoss(ignore_index=0),
    optimizer=Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """Evaluate and save
    """
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'[{choice}] valid_acc: {val_acc:.4f}, test_acc: {test_acc:.4f}, best_val_acc: {self.best_val_acc:.4f}\n')

    @staticmethod
    def evaluate(data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = F.softmax(model.predict(x_true)[1], dim=-1)
            y_pred = y_pred[:, mask_idx, [neg_id, pos_id]].argmax(axis=1)
            y_true = (y_true[:, mask_idx] == pos_id).long()
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    if choice == 'zero-shot1':
        valid_acc = evaluator.evaluate(valid_dataloader)
        test_acc = evaluator.evaluate(test_dataloader)
        print(f'[{choice}] valid_acc: {valid_acc:.4f}, test_acc: {test_acc:.4f}')
    else:
        model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
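The PET-style evaluation above reads the MLM distribution at `mask_idx` and only compares the two label words (不 for negative, 很 for positive). A toy sketch of that decision rule with made-up probabilities and a made-up three-token vocabulary; it only illustrates the slicing-then-argmax step, not the real model.

import torch

# pretend MLM output at the [MASK] position: probabilities over a tiny vocab
vocab = {'不': 0, '很': 1, '的': 2}
probs = torch.tensor([[0.10, 0.55, 0.35],     # sample 1: '很' wins -> positive
                      [0.60, 0.20, 0.20]])    # sample 2: '不' wins -> negative

neg_id, pos_id = vocab['不'], vocab['很']
pred = probs[:, [neg_id, pos_id]].argmax(dim=1)   # 0 = negative, 1 = positive
print(pred)                                       # tensor([1, 0])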
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_P_tuning.py
#! -*- coding:utf-8 -*-
# Sentiment analysis example using MLM + P-tuning; in the current setup all weights are finetuned together (nothing frozen)
# Official project: https://github.com/THUDM/P-tuning
# Reference project: https://github.com/bojone/P-tuning
# few-shot: 0.8953/0.8953

import torch
import torch.nn as nn
import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from torch.optim import Adam
from bert4torch.snippets import sequence_padding, ListDataset, Callback
from torch.utils.data import DataLoader
from torchinfo import summary

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/robert/[hit_torch_base]--chinese-roberta-wwm-ext-base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'finetune_all'  # finetune_all / finetune_few

def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D

# Load the datasets
train_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data')
valid_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data')
test_data = load_data('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data')

# Simulate labeled and unlabeled data
train_frac = 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# The corresponding task description (prompt template)
mask_idx = 5
desc = ['[unused%s]' % i for i in range(1, 9)]
desc.insert(mask_idx - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')
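# Editor's note on the template (not in the original source): with mask_idx = 5, the prompt tokens
# placed right after [CLS] are
#     [unused1] [unused2] [unused3] [unused4] [MASK] [unused5] [unused6] [unused7] [unused8]
# so index 5 of the full sequence ([CLS] + prompt + text) is the [MASK] slot; the label is read off
# by predicting '很' (positive) or '不' (negative) there, while the [unusedX] embeddings act as the
# learnable prompt.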
def random_masking(token_ids):
    """Apply random masking to the input."""
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
            source.append(t)
            target.append(t)
        elif r < 0.15:
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
            target.append(t)
        else:
            source.append(t)
            target.append(0)
    return source, target
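# Editor's note (not in the original source): this is the standard BERT masking recipe applied to
# roughly 15% of the tokens, split 80/10/10 into [MASK] / keep the original token / replace with a
# random token; every unselected position gets target id 0, which MyLoss below skips via ignore_index=0.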
class MyDataset(ListDataset):
    def collate_fn(self, batch):
        batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
        for text, label in batch:
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            if label != 2:
                token_ids = token_ids[:1] + desc_ids + token_ids[1:]
                segment_ids = [0] * len(desc_ids) + segment_ids
            if self.kwargs['random']:
                source_ids, target_ids = random_masking(token_ids)
            else:
                source_ids, target_ids = token_ids[:], token_ids[:]
            if label == 0:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = neg_id
            elif label == 1:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = pos_id
            batch_token_ids.append(source_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(target_ids)
        batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
        batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
        batch_output_ids = torch.tensor(sequence_padding(batch_output_ids), dtype=torch.long, device=device)
        return [batch_token_ids, batch_segment_ids], batch_output_ids

# Build the datasets and dataloaders
train_dataset = MyDataset(data=train_data, random=True)
valid_dataset = MyDataset(data=valid_data, random=False)
test_dataset = MyDataset(data=test_data, random=False)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=valid_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)

class MyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_preds, y_true):
        y_pred = y_preds[1]  # MLM logits
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        loss = super().forward(y_pred, y_true.flatten())
        return loss

if choice == 'finetune_few':
    # Train only the embeddings of the prompt tokens; this part has not been fully debugged yet
    class PtuningBERT(BaseModel):
        def __init__(self):
            super().__init__()
            self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True,
                                                tie_emb_prj_weight=True, custom_attention_mask=True)
            for name, param in self.bert.named_parameters():
                if ('word_embeddings' not in name) and ('mlmDecoder' not in name):
                    param.requires_grad = False  # freeze every layer except the word embeddings (and the MLM decoder)

        def forward(self, token_ids, segment_ids):
            embedding = self.bert.embeddings.word_embeddings(token_ids)
            embedding_no_grad = embedding.detach()
            mask = torch.ones(token_ids.shape[1], dtype=torch.long, device=token_ids.device)
            mask[1:9] -= 1  # only the tokens at positions 1~8 are optimized
            embedding[:, mask.bool()] = embedding_no_grad[:, mask.bool()]
            attention_mask = (token_ids != tokenizer._token_pad_id)
            return self.bert([embedding, segment_ids, attention_mask])

    model = PtuningBERT().to(device)
    summary(model, input_data=next(iter(train_dataloader))[0])
elif choice == 'finetune_all':
    # Train all weights together
    model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True).to(device)
    summary(model, input_data=[next(iter(train_dataloader))[0]])

# Define the loss and optimizer; custom choices are supported here
model.compile(
    loss=MyLoss(ignore_index=0),
    optimizer=Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=6e-4),
)

class Evaluator(Callback):
    """Evaluate at the end of each epoch and keep the best checkpoint."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'valid_acc: {val_acc:.4f}, test_acc: {test_acc:.4f}, best_val_acc: {self.best_val_acc:.4f}\n')

    @staticmethod
    def evaluate(data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true)[1]
            y_pred = y_pred[:, mask_idx, [neg_id, pos_id]].argmax(axis=1)
            y_true = (y_true[:, mask_idx] == pos_id).long()
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
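The 'finetune_few' branch freezes everything except the word embeddings and the MLM decoder, then mixes detached and non-detached embeddings inside forward() so that only the prompt positions receive gradients; the author notes this path is not fully debugged yet. A common alternative is sketched below (an editor's sketch, not part of the repository, assuming the model.bert.embeddings.word_embeddings access path used in PtuningBERT): keep the embedding matrix trainable and zero the gradient of every row except the prompt token ids 1~8 with a hook.

prompt_ids = torch.arange(1, 9, device=device)  # vocabulary ids of [unused1]..[unused8]

def keep_prompt_rows_only(grad):
    # Zero the gradient of every embedding row except the prompt tokens.
    row_mask = torch.zeros_like(grad)
    row_mask[prompt_ids] = 1.0
    return grad * row_mask

# emb_weight = model.bert.embeddings.word_embeddings.weight   # assumed access path
# emb_weight.register_hook(keep_prompt_rows_only)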
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_albert.py
#! -*- coding:utf-8 -*-
# Sentiment classification example loading albert_zh weights (https://github.com/brightmart/albert_zh)
# valid_acc: 94.46, test_acc: 93.98

import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tensorboardX import SummaryWriter
import random
import os

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/albert/[brightmart_tf_small]--albert_small_zh_google/albert_config_small_google.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/albert/[brightmart_tf_small]--albert_small_zh_google/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/albert/[brightmart_tf_small]--albert_small_zh_google/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

writer = SummaryWriter(log_dir='./summary')  # prepare summary writer
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data.
        Single-sample format: (text, label id)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text, label = l.strip().split('\t')
                D.append((text, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of the bert backbone
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path, checkpoint_path, model='albert', with_pool=True)  # build the model and load the weights
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        hidden_states, pooling = self.bert([token_ids, segment_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom choices are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total

class Evaluator(Callback):
    """Evaluate at the end of each epoch and keep the best checkpoint."""
    def __init__(self):
        self.best_val_acc = 0.

    # def on_batch_end(self, global_step, local_step, logs=None):
    #     if global_step % 10 == 0:
    #         writer.add_scalar(f"train/loss", logs['loss'], global_step)
    #         val_acc = evaluate(valid_dataloader)
    #         writer.add_scalar(f"valid/acc", val_acc, global_step)

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        test_acc = evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
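The albert example pools the encoder output with get_pool_emb and pool_method='cls'. As a plain-PyTorch illustration of what the two most common pool methods do (an editor's sketch, not the bert4torch implementation), 'cls' takes the first token's vector while 'mean' averages over the non-padding positions:

def pool_hidden(hidden_states, attention_mask, method='cls'):
    # hidden_states: (batch, seq_len, hidden); attention_mask: (batch, seq_len) with 1 for real tokens
    if method == 'cls':
        return hidden_states[:, 0]
    mask = attention_mask.unsqueeze(-1).float()
    return (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)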
bert/bert4torch_cmcc/examples/sentence_classfication/task_sentiment_classification_electra.py
#! -*- coding:utf-8 -*-
# Sentiment classification example loading electra weights
# valid_acc: 94.94, test_acc: 94.78

import numpy as np
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import os

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/electra/[hit_torch_base]--chinese-electra-base-discriminator/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/electra/[hit_torch_base]--chinese-electra-base-discriminator/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/electra/[hit_torch_base]--chinese-electra-base-discriminator/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data, splitting each text into pieces no longer than maxlen where possible."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of the bert backbone
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        # Specify the model type and the corresponding checkpoint path
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='electra')
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(768, 2)

    def forward(self, token_ids, segment_ids):
        hidden_states = self.bert([token_ids, segment_ids])
        output = self.dropout(hidden_states[:, 0, :])  # take the first position of hidden_states, i.e. [CLS], then apply a dense layer
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer; custom choices are supported here
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total

class Evaluator(Callback):
    """Evaluate at the end of each epoch and keep the best checkpoint."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        test_acc = evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
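Unlike the albert example, the electra discriminator is built without with_pool=True, so the model takes hidden_states[:, 0, :] (the [CLS] position) itself and adds its own dense head. A minimal single-sentence prediction sketch for the trained classifier (an editor's addition, reusing only calls already present in this file) would be:

def predict_label(text):
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    tokens = torch.tensor([token_ids], dtype=torch.long, device=device)
    segments = torch.tensor([segment_ids], dtype=torch.long, device=device)
    logits = model.predict([tokens, segments])  # (1, 2)
    return logits.argmax(dim=-1).item()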