sunzhq2 / infer-yidong · Commits · Commit c007ba1a

Authored Apr 03, 2026 by sunzhq2 · Committed by xuxo, Apr 03, 2026

Commit message: update

Pipeline #3464 failed with stages in 0 seconds · Changes: 258 · Pipelines: 1
Showing 20 changed files with 3171 additions and 0 deletions:

  bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_simbert.py (+253, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/ana.sh (+9, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/bert_migraphx.py (+324, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/bert_migraphx.py.bak (+214, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/bert_postprocess.py (+63, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/bert_to_onnx.py (+182, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/bertbase_postprocess.py (+227, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/onnx_convert.sh (+16, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/onnx_inference.sh (+32, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/outpath (+14, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/output (+14, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/post.sh (+4, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_W2NER.py (+484, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_cascade_crf.py (+231, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf.py (+196, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf_add_posseg.py (+199, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf_freeze.py (+209, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf_inference.py (+200, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_efficient_global_pointer.py (+149, -0)
  bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_global_pointer.py (+151, -0)
bert/bert4torch_cmcc/examples/seq2seq/task_seq2seq_simbert.py (new file, mode 100644):
```python
#! -*- coding: utf-8 -*-
# SimBERT pretraining code; it can also be used for finetuning, though other
# approaches such as sentence_bert may work better for finetuning.
# Official project: https://github.com/ZhuiyiTechnology/simbert
import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder, Callback, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab

# Basic settings
maxlen = 32
batch_size = 32

# Loads simbert weights and continues pretraining/finetuning on your own data;
# to pretrain from scratch you can also load a bert/roberta checkpoint directly.
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and simplify the vocabulary, build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the corpus, one JSON object per line, e.g.:
        {"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                D.append(json.loads(l))
        return D


def truncate(text):
    """Truncate a sentence."""
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]


def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    for d in batch:
        text, synonyms = d['text'], d['synonyms']
        synonyms = [text] + synonyms
        np.random.shuffle(synonyms)
        text, synonym = synonyms[:2]
        text, synonym = truncate(text), truncate(synonym)
        token_ids, segment_ids = tokenizer.encode(text, synonym, maxlen=maxlen * 2)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        token_ids, segment_ids = tokenizer.encode(synonym, text, maxlen=maxlen * 2)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]

train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


# Build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path,
                                            with_pool='linear', with_mlm='linear', application='unilm',
                                            keep_tokens=keep_tokens)
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb

model = Model(pool_method='cls').to(device)


class TotalLoss(nn.Module):
    """The loss has two parts: the seq2seq cross entropy and the similarity cross entropy."""
    def forward(self, outputs, target):
        seq_logit, sen_emb = outputs
        seq_label, seq_mask = target
        seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
        similarity_loss = self.compute_loss_of_similarity(sen_emb)
        return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}

    def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
        '''
        y_pred: [btz, seq_len, hdsz]
        y_true: [btz, seq_len]
        y_mask: [btz, seq_len]
        '''
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # marks the positions to predict
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return F.cross_entropy(y_pred, y_true, ignore_index=0)

    def compute_loss_of_similarity(self, y_pred):
        y_true = self.get_labels_of_similarity(y_pred)  # build the labels
        y_pred = F.normalize(y_pred, p=2, dim=-1)  # normalize the sentence vectors
        similarities = torch.matmul(y_pred, y_pred.T)  # similarity matrix
        similarities = similarities - torch.eye(y_pred.shape[0], device=device) * 1e12  # mask out the diagonal
        similarities = similarities * 30  # scale
        loss = F.cross_entropy(similarities, y_true)
        return loss

    def get_labels_of_similarity(self, y_pred):
        idxs = torch.arange(0, y_pred.shape[0], device=device)
        idxs_1 = idxs[None, :]
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
        labels = idxs_1.eq(idxs_2).float()
        return labels

model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])


class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder"""
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]

synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)


def cal_sen_emb(text_list):
    '''Given a list of texts, compute the sentence embeddings.'''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z


def gen_synonyms(text, n=100, k=20):
    """Generate n similar sentences for text, then return the k most similar ones.
    Approach: generate with seq2seq, then score similarity with the encoder and sort.
    Example:
    >>> gen_synonyms(u'微信和支付宝哪个好?')
    [
        u'微信和支付宝,哪个好?',
        u'微信和支付宝哪个好',
        u'支付宝和微信哪个好',
        u'支付宝和微信哪个好啊',
        u'微信和支付宝那个好用?',
        u'微信和支付宝哪个好用',
        u'支付宝和微信那个更好',
        u'支付宝和微信哪个好用',
        u'微信和支付宝用起来哪个好?',
        u'微信和支付宝选哪个好',
    ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # exclude the original text
    r = [text] + r
    Z = cal_sen_emb(r)
    Z /= (Z ** 2).sum(dim=1, keepdims=True) ** 0.5
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]


def just_show(some_samples):
    """Inspect the effect on a few random samples."""
    S = [np.random.choice(some_samples) for _ in range(3)]
    for s in S:
        try:
            print(u'Original sentence: %s' % s)
            print(u'Synonymous sentences: ', gen_synonyms(s, 10, 10))
            print()
        except:
            pass


class Evaluator(Callback):
    """Evaluate the model."""
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, global_step, epoch, logs=None):
        # keep the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # show some examples
        just_show(['微信和支付宝拿个好用?', '微信和支付宝,哪个好?', '微信和支付宝哪个好', '支付宝和微信哪个好',
                   '支付宝和微信哪个好啊', '微信和支付宝那个好用?', '微信和支付宝哪个好用', '支付宝和微信那个更好',
                   '支付宝和微信哪个好用', '微信和支付宝用起来哪个好?', '微信和支付宝选哪个好'])


if __name__ == '__main__':
    choice = 'similarity'  # train generate similarity
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
    elif choice == 'generate':
        print(gen_synonyms('我想去北京玩玩可以吗', 10, 10))
    elif choice == 'similarity':
        target_text = '我想去首都北京玩玩'
        text_list = ['我想去北京玩', '北京有啥好玩的吗?我想去看看', '好渴望去北京游玩啊']
        Z = cal_sen_emb([target_text] + text_list)
        Z /= (Z ** 2).sum(dim=1, keepdims=True) ** 0.5
        similarity = torch.matmul(Z[1:], Z[0])
        for i, line in enumerate(text_list):
            print(f'cos_sim: {similarity[i].item():.4f}, tgt_text: "{target_text}", cal_text: "{line}"')
    else:
        model.load_weights('./best_model.pt')
```
bert/bert4torch_cmcc/examples/sequence_labeling/ana.sh (new file, mode 100644):
```bash
maxtime=`cat result_*.log |grep "^total_inf" |awk 'BEGIN{s=0}{if(s<$2) s=$2}END{print s}'`
#inftime=`cat result_*.log |grep "^total_inf" |awk '{s+=$2}END{print s, s/NR}'`
inffps=`cat result_*.log |grep "^avg_infer_fps" |awk '{s+=$2}END{print s, s/NR}'`
loadtime=`cat result_*.log |grep "^load_data_total" |awk 'BEGIN{s=0}{if(s<$2) s=$2}END{print s}'`
loadfps=`cat result_*.log |grep "^load_data_avg" |awk '{s+=$2}END{print s, s/NR}'`
echo "max infer time: $maxtime"
echo "Average infer fps: $inffps"
echo "max load time: $loadtime"
echo "Average load fps: $loadfps"
```
bert/bert4torch_cmcc/examples/sequence_labeling/bert_migraphx.py (new file, mode 100644):
```python
#! -*- coding:utf-8 -*-
# bert + crf for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
from apex.optimizers import FusedLAMB
import apex_C
from apex import amp
import migraphx
import torch.nn as nn
import torch.optim as optim
# from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.snippets import sequence_padding, ListDataset
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os
import time
import multiprocessing as mp
from multiprocessing import Process, Queue, Manager

categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
maxlen = 256
batch_size = 64
config_path = '/datasets/bert-base-chinese/config.json'
dict_path = '/datasets/bert-base-chinese/vocab.txt'

gpuid = os.getenv('HIP_VISIBLE_DEVICES')
labdir = os.path.join('results', gpuid, 'label')
resultdir = os.path.join('results', gpuid, 'data')
os.makedirs(resultdir, exist_ok=True)
os.makedirs(labdir, exist_ok=True)


def AllocateOutputMemory(model):
    outputData = {}
    for key in model.get_outputs().keys():
        outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
    return outputData


# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

if os.path.isfile("/home/sunzhq/workspace/yidong-infer/bert/bert4torch_cmcc/examples/sequence_labeling/models/bert_best_mha_md5.mxr"):
    print("***********load mxr model******************")
    model = migraphx.load("/home/sunzhq/workspace/yidong-infer/bert/bert4torch_cmcc/examples/sequence_labeling/models/bert_best_mha_md5.mxr")
else:
    print("***********load onnx model******************")
    # Load the model
    maxInput = {"input": [64, 256]}
    model = migraphx.parse_onnx("/models/bert_best.onnx", map_input_dims=maxInput)
    migraphx.quantize_fp16(model)
    # Compile
    model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)

inputName = list(model.get_inputs().keys())[0]
modelData = AllocateOutputMemory(model)


def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    maxlen = 256
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)  # truncate to maxlen
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        # initialize labels to all zeros (which, depending on the design, is the 'O' tag)
        labels = np.zeros(len(token_ids), dtype=np.int64)
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start_idx = start_mapping[start]
                end_idx = end_mapping[end]
                labels[start_idx] = categories_label2id['B-' + label]
                labels[start_idx + 1:end_idx + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen, value=tokenizer._token_pad_id),
                                   dtype=torch.long, device="cuda:0")
    batch_labels = torch.tensor(sequence_padding(batch_labels, length=maxlen, value=0),
                                dtype=torch.long, device="cuda:0")
    return batch_token_ids, batch_labels

# Convert the dataset
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn)


def pad_data(data, seq=256):
    if len(data.shape) == 1:
        return np.pad(data, ((0, seq - data.shape[0])), "constant", constant_values=(0))
    elif len(data.shape) == 2:
        return np.pad(data, ((0, 0), (0, seq - data.shape[1])), "constant", constant_values=(0))
    else:
        # shape(bs, seq, len(categories))
        return np.pad(data, ((0, 0), (0, seq - data.shape[1]), (0, 0)), "constant", constant_values=(0))


def pad_data_bin(data, output, bs, seq=256, len_catagory=7):
    if output == "emission_score":
        data = data.reshape((bs, -1, len_catagory))
    else:
        data = data.reshape((bs, -1))
    return pad_data(data, seq)


def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    end = 0
    infer_times = []
    total_infer_times = []
    data_idx = 0
    total_start = time.time()
    # warmup
    for token_ids, label in tqdm(data):
        data_numpy = token_ids.detach().cpu().numpy()
        img_data = np.zeros(data_numpy.shape).astype("int64")
        for i in range(data_numpy.shape[0]):
            img_data[i, :] = data_numpy[i, :]
        modelData[inputName] = migraphx.to_gpu(migraphx.argument(img_data))
        preds_dcu = model.run(modelData)
        break
    for token_ids, label in tqdm(data):
        data_numpy = token_ids.detach().cpu().numpy()
        # device = torch.device("cuda")
        # Note: the explicit copy below is required; otherwise the input strides are wrong inside migraphx
        img_data = np.zeros(data_numpy.shape).astype("int64")
        for i in range(data_numpy.shape[0]):
            img_data[i, :] = data_numpy[i, :]
        if img_data.shape[0] != 64:
            break
        modelData[inputName] = migraphx.to_gpu(migraphx.argument(img_data))
        start = time.time()
        # result = model.run({"input":img_data})
        preds_dcu = model.run(modelData)
        end += time.time() - start
        infer_times.append(time.time() - start)
        print(f"****infer time: {infer_times[-1]}s***** fps: {64 / infer_times[-1]}*********")
        total_infer_times.append(time.time() - total_start)
        result_1 = np.array(migraphx.from_gpu(preds_dcu[0]))
        result_2 = np.array(migraphx.from_gpu(preds_dcu[1]))
        emission_score = torch.from_numpy(np.array(result_1, copy=False))
        attention_mask = torch.from_numpy(np.array(result_2, copy=False))
        labels = label.cpu().numpy()
        # emission_score = torch.from_numpy(np.array(result[0], copy=False))
        # attention_mask = torch.from_numpy(np.array(result[1], copy=False))
        # save bin files
        labels = np.pad(labels, ((0, batch_size - labels.shape[0]), (0, 0)), 'constant', constant_values=-1)
        labels.tofile(f'{labdir}/{data_idx}.bin')
        emission_score = np.pad(emission_score, ((0, batch_size - emission_score.shape[0]), (0, 0), (0, 0)), 'constant')
        attention_mask = np.pad(attention_mask, ((0, batch_size - attention_mask.shape[0]), (0, 0)), 'constant')
        emission_score.tofile(f'{resultdir}/{data_idx}_0.bin')
        attention_mask.tofile(f'{resultdir}/{data_idx}_1.bin')
        labels = pad_data_bin(labels, "labels", batch_size)
        emission_score = pad_data_bin(emission_score, "emission_score", batch_size)
        attention_mask = pad_data_bin(attention_mask, "attention_mask", batch_size)
        labels = torch.Tensor(labels)
        # mask last data
        data_mask = labels[:, 0] >= 0
        labels = labels[data_mask]
        emission_score = torch.Tensor(emission_score)[data_mask]
        attention_mask = torch.Tensor(attention_mask)[data_mask]
        scores = crf.decode(emission_score, attention_mask)
        true_label = []
        for label in labels:
            true_label += [categories_id2label[int(l)] for l in label if l != 0]
        attention_mask = labels.gt(0)
        # token-level metrics
        X += (scores.eq(labels) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += labels.gt(0).sum().item()
        # entity-level metrics
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(labels)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
        data_idx += 1
        total_start = time.time()
    print("total_sample_data:", (64 * data_idx))
    avg_infer_fps = 64 * len(infer_times) / sum(infer_times)
    print(f"total_infer_time: {end}s")
    print(f'avg_infer_fps: {avg_infer_fps} samples/s')
    load_data_infer_time = sum(total_infer_times)
    load_data_avg_infer_fps = len(total_infer_times) * 64 / sum(total_infer_times)
    print(f'load_data_total_infer_time: {load_data_infer_time}s')
    print(f'load_data_avg_total_Infer_fps: {load_data_avg_infer_fps} samples/s')
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2


def trans_entity2tuple(scores):
    '''Convert a tensor into a set of (sample id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            # skip padding / ignore_index (-100)
            if item == 0:
                continue
            # fetch the label name safely (make sure the key is an int)
            tag_id = int(item.item())  # cast to int to avoid float keys
            flag_tag = categories_id2label[tag_id]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for ent in entity_ids:
            if ent:  # only add non-empty entities
                batch_entity_ids.add(tuple(ent))
    return batch_entity_ids


class Model(BaseModel):
    def __init__(self, config_path):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=None, segment_vocab_size=0)
        # embedding_dims:768, len_categories: 7
        self.fc = nn.Linear(768, 7)
        # includes start/end transitions
        self.crf = CRF(7)

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask


def build_model(config_path, checkpoint_path):
    model = Model(config_path).to("cpu")
    model.load_weights(checkpoint_path, strict=False)
    return model


if __name__ == '__main__':
    ptmodel = build_model("/datasets/bert-base-chinese/config.json", "/models/best_model.pt")
    crf = ptmodel.crf
    # time_fw is the file object for the timing log, at 'log/time/time.txt'
    time_fw = open(os.path.join('log/', f'time.txt'), 'a', encoding='utf-8')
    # write the program start time
    time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
    f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
    print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
    print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f}\n')
    # write the program end time
    time_fw.write('End Time: {:.6f}\n'.format(time.time()))
    time_fw.flush()
    time_fw.close()
```
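The script above interleaves benchmarking, bin dumps, and metric computation. Stripped to its core, the MIGraphX path it takes looks roughly like the sketch below (illustrative only; the model path and token contents are placeholders, and only calls bert_migraphx.py itself makes are used):

```python
# Minimal sketch of the compile-once / run-many MIGraphX loop, assuming a
# static [64, 256] int64 "input" as in the script above.
import numpy as np
import migraphx

prog = migraphx.parse_onnx("model.onnx", map_input_dims={"input": [64, 256]})  # placeholder path
migraphx.quantize_fp16(prog)
prog.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)

# Pre-allocate GPU output buffers once, then overwrite the input buffer each step.
buffers = {k: migraphx.allocate_gpu(s=prog.get_outputs()[k]) for k in prog.get_outputs().keys()}
input_name = list(prog.get_inputs().keys())[0]

tokens = np.zeros((64, 256), dtype=np.int64)  # one contiguous batch of token ids
buffers[input_name] = migraphx.to_gpu(migraphx.argument(tokens))
outputs = prog.run(buffers)
emissions = np.array(migraphx.from_gpu(outputs[0]))  # expect [64, 256, 7]
```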
bert/bert4torch_cmcc/examples/sequence_labeling/bert_migraphx.py.bak (new file, mode 100644):
```python
#! -*- coding:utf-8 -*-
# bert + crf for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
from apex.optimizers import FusedLAMB
import apex_C
from apex import amp
import migraphx
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os
import time

maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = '/datasets/bert-base-chinese/config.json'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = "cuda"
gpuid = os.getenv('HIP_VISIBLE_DEVICES')
labdir = os.path.join('results', gpuid, 'label')
resultdir = os.path.join('results', gpuid, 'data')
os.makedirs(resultdir, exist_ok=True)
os.makedirs(labdir, exist_ok=True)


# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

if os.path.isfile("/workspace/bert4torch/examples/sequence_labeling/bert_best.mxr"):
    model = migraphx.load("/workspace/bert4torch/examples/sequence_labeling/bert_best.mxr")
else:
    # Load the model
    maxInput = {"input": [64, 256]}
    model = migraphx.parse_onnx("/workspace/bert4torch/examples/sequence_labeling/bert_best.onnx", map_input_dims=maxInput)
    migraphx.quantize_fp16(model)
    # Compile
    model.compile(migraphx.get_target("gpu"), device_id=0)


def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# Convert the dataset
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn, shuffle=False, drop_last=True)

crf = CRF(len(categories)).to(device)


def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    end = 0
    infer_times = []
    total_infer_times = []
    data_idx = 0
    total_start = time.time()
    for token_ids, label in tqdm(data):
        data_numpy = token_ids.detach().cpu().numpy()
        device = torch.device("cuda")
        # Note: the explicit copy below is required; otherwise the input strides are wrong inside migraphx
        img_data = np.zeros(data_numpy.shape).astype("int64")
        for i in range(data_numpy.shape[0]):
            img_data[i, :] = data_numpy[i, :]
        start = time.time()
        result = model.run({"input": img_data})
        end += time.time() - start
        infer_times.append(time.time() - start)
        total_infer_times.append(time.time() - total_start)
        emission_score = torch.from_numpy(np.array(result[0], copy=False)).to(device)
        attention_mask = torch.from_numpy(np.array(result[1], copy=False)).to(device)
        scores = crf.decode(emission_score, attention_mask)
        attention_mask = label.gt(0)
        # token-level metrics
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity-level metrics
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
        # save bin files
        score = np.array(result[0])
        score = np.pad(score, ((0, batch_size - emission_score.shape[0]), (0, 0), (0, 0)), 'constant')
        mask = np.array(result[1])
        mask = np.pad(mask, ((0, batch_size - emission_score.shape[0]), (0, 0)), 'constant')
        # score = np.pad(np.array(result[0]), ((0, batch_size-emission_score.shape[0]), (0,0), (0,0), 'constant'))
        # mask = np.pad(np.array(result[1]), ((0, batch_size-emission_score.shape[0]), (0,0), (0,0), 'constant'))
        label = label.cpu().numpy()
        label = np.pad(label, ((0, batch_size - emission_score.shape[0]), (0, 0)), 'constant', constant_values=-1)
        score.tofile(f'{resultdir}/{data_idx}_0.bin')
        mask.tofile(f'{resultdir}/{data_idx}_1.bin')
        label.tofile(f'{labdir}/{data_idx}.bin')
        # np.array(result[0], copy=False).tofile(f'{resultdir}/{data_idx}_0.bin')
        # np.array(result[1], copy=False).tofile(f'{resultdir}/{data_idx}_1.bin')
        # label.cpu().numpy().tofile(f'{labdir}/{data_idx}.bin')
        data_idx += 1
        total_start = time.time()
    print("total_sample_data:", (64 * data_idx))
    # avg_infer_time = sum(infer_times[1:]) / len(infer_times[1:])
    avg_infer_time = 64 * len(infer_times) / sum(infer_times)
    print(f"total_infer_time: {end}s")
    print(f'avg_infer_fps: {avg_infer_time}samples/s')
    load_data_infer_time = sum(total_infer_times)
    load_data_avg_infer_time = len(total_infer_times) * 64 / sum(total_infer_times)
    print(f'load_data_total_infer_time: {load_data_infer_time}s')
    print(f'load_data_avg_total_Infer_fps: {load_data_avg_infer_time} samples/s')
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2


def trans_entity2tuple(scores):
    '''Convert a tensor into a set of (sample id, start, end, entity type) tuples for metric computation
    '''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids


if __name__ == '__main__':
    # time_fw is the file object for the timing log, at 'log/time/time.txt'
    time_fw = open(os.path.join('log/', f'time.txt'), 'a', encoding='utf-8')
    # write the program start time
    time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
    f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
    print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
    print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f}\n')
    # write the program end time
    time_fw.write('End Time: {:.6f}\n'.format(time.time()))
    time_fw.flush()
    time_fw.close()
```
bert/bert4torch_cmcc/examples/sequence_labeling/bert_postprocess.py (new file, mode 100644):
```python
import os
import numpy as np
from sklearn.metrics import classification_report

# ===================== model.pt is not loaded, so the results are on the low side =====================
result_dir = "results/0"
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
maxlen = 256
batch_size = 64
label_dir = os.path.join(result_dir, "label")
data_dir = os.path.join(result_dir, "data")

# ===================== load labels =====================
def load_all_labels():
    y_true = []
    files = sorted(os.listdir(label_dir), key=lambda x: int(os.path.splitext(x)[0]))
    for f in files:
        arr = np.fromfile(os.path.join(label_dir, f), dtype=np.int64).reshape(-1, maxlen)
        for seq in arr:
            y_true.extend(seq[seq != -1].tolist())
    return y_true

# ===================== load predictions =====================
def load_all_preds():
    y_pred = []
    bin_files = sorted([f for f in os.listdir(data_dir) if f.endswith("_0.bin")],
                       key=lambda x: int(x.split("_")[0]))
    for fname in bin_files:
        idx = int(fname.split("_")[0])
        emit = np.fromfile(os.path.join(data_dir, f"{idx}_0.bin"), dtype=np.float32).reshape(batch_size, maxlen, 7)
        pred = np.argmax(emit, axis=-1)
        for seq in pred:
            y_pred.extend(seq.tolist())
    return y_pred

# ===================== merge BIO =====================
def merge_bio(seq):
    res = []
    for x in seq:
        if x in (1, 2):
            res.append("LOC")
        elif x in (3, 4):
            res.append("PER")
        elif x in (5, 6):
            res.append("ORG")
        else:
            res.append("O")
    return res

# ===================== main =====================
if __name__ == "__main__":
    y_true = load_all_labels()
    y_pred = load_all_preds()
    L = min(len(y_true), len(y_pred))
    y_true = y_true[:L]
    y_pred = y_pred[:L]
    y_true_ent = merge_bio(y_true)
    y_pred_ent = merge_bio(y_pred)
    print("\n" + "=" * 60)
    print(classification_report(y_true_ent, y_pred_ent, digits=4))
    print("=" * 60)
```
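The banner comment above notes that results are on the low side because no checkpoint (and hence no CRF) is loaded: load_all_preds takes a plain argmax over the emission scores, which, unlike CRF decoding, can emit tag transitions that are invalid under the BIO scheme. A toy illustration (not part of the commit):

```python
# Why argmax-only decoding can underperform a CRF: it has no notion of
# transition validity, so 'I-PER' can follow 'B-LOC'.
import numpy as np

categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
emissions = np.array([
    [0.1, 0.8, 0.0, 0.0, 0.1, 0.0, 0.0],  # strongly B-LOC
    [0.1, 0.0, 0.4, 0.0, 0.5, 0.0, 0.0],  # I-PER narrowly beats I-LOC
])
print([categories[i] for i in emissions.argmax(-1)])
# ['B-LOC', 'I-PER'] -- an invalid BIO sequence; a CRF's learned transition
# scores would penalize B-LOC -> I-PER and prefer I-LOC here.
```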
bert/bert4torch_cmcc/examples/sequence_labeling/bert_to_onnx.py (new file, mode 100644):
```python
#! -*- coding:utf-8 -*-
# bert + crf for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
from apex.optimizers import FusedLAMB
import apex_C
from apex import amp
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os
import time

maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = "/models/best_model.pt"
dict_path = '/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#local_rank = int(os.environ['LOCAL_RANK'])
#print("local_rank ", local_rank)
#torch.cuda.set_device(local_rank)
#device = torch.device("cuda", local_rank)
#torch.distributed.init_process_group(backend='nccl')

# fix the seed
seed_everything(42)


# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# Convert the dataset
#train_dataset = MyDataset('/workspace/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
#train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
#train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn)


# Define the model structure on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))
        # includes start/end transitions
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

## For DDP multi-gpu use; master_rank is the local_rank used to print training progress
class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.module.crf(*outputs, labels)

#try to use apex
optimizer = optim.Adam(model.parameters(), lr=6e-5)
#model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", master_weights=True, verbosity=0)
#model = BaseModelDDP(model, master_rank=0, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=False)
model.compile(
    loss=Loss(),
    optimizer=optimizer,
    # use_apex=True,  # whether to use apex_amp mixed precision here
)

#------------------------------------------------------------
def evaluate(data):
    # for token_ids, label in tqdm(data):
    #     #torch.onnx.export(model.module, token_ids, "./bert_best.onnx", opset_version=13,
    #     torch.onnx.export(model, token_ids, "./bert_best.onnx", opset_version=13,
    #                       input_names=['input'],
    #                       output_names=['output'],
    #                       dynamic_axes={'input': {1: 'token'}})  # dim 1 is dynamic; dim 0 defaults to batch
    #     print("ONNX model conversion done")
    #     break
    model.eval()
    dummy_input = torch.randint(1, 2000, size=(64, 256), dtype=torch.long, device=device)
    torch.onnx.export(
        model,
        dummy_input,
        "/models/onnx-models/bert_best_static.onnx",
        opset_version=13,
        input_names=["input"],
        output_names=["emission_scores", "attention_mask"],  # more accurate output names
        do_constant_folding=True,
    )
    print("✅ static ONNX export done!")
    # for token_ids, label in tqdm(data):
    #     #torch.onnx.export(model.module, token_ids, "./bert_best.onnx", opset_version=13,
    #     torch.onnx.export(model, token_ids, "./bert_best_1.onnx", opset_version=13,
    #                       input_names=['input'],
    #                       output_names=['output'],
    #                       dynamic_axes={'input': {1: 'token'}},  # dim 1 is dynamic; dim 0 defaults to batch
    #                       do_constant_folding=True)  # enable constant folding to reduce runtime compute
    #     print("ONNX model conversion done")
    #     break


if __name__ == '__main__':
    # time_fw is the file object for the timing log, at 'log/time/time.txt'
    time_fw = open(os.path.join('log/', f'time.txt'), 'a', encoding='utf-8')
    # write the program start time
    time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
    model.load_weights("/models/best_model.pt")
    evaluate(valid_dataloader)
```
bert/bert4torch_cmcc/examples/sequence_labeling/bertbase_postprocess.py (new file, mode 100644):
```python
import os
import json
import argparse
from tqdm import tqdm
import numpy as np
import torch
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2
import torch.nn as nn
from bert4torch.layers import CRF
from bert4torch.models import build_transformer_model, BaseModel

""" Run command:
python bertbase_postprocess.py -i results/0/data -l results/0/label -o output -c bert-base-chinese/config.json -k best_model.pt
"""

class Model(BaseModel):
    def __init__(self, config_path):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=None, segment_vocab_size=0)
        # embedding_dims:768, len_categories: 7
        self.fc = nn.Linear(768, 7)
        # includes start/end transitions
        self.crf = CRF(7)

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask


def build_model(config_path, checkpoint_path):
    model = Model(config_path).to("cpu")
    model.load_weights(checkpoint_path, strict=False)
    return model


def pad_data(data, seq=256):
    if len(data.shape) == 1:
        return np.pad(data, ((0, seq - data.shape[0])), "constant", constant_values=(0))
    elif len(data.shape) == 2:
        return np.pad(data, ((0, 0), (0, seq - data.shape[1])), "constant", constant_values=(0))
    else:
        # shape(bs, seq, len(categories))
        return np.pad(data, ((0, 0), (0, seq - data.shape[1]), (0, 0)), "constant", constant_values=(0))


def pad_data_npy(path, seq=256):
    return pad_data(np.load(path), seq)


def pad_data_bin(path, output, bs, seq=256, len_catagory=7):
    data = None
    if output == "emission_score":
        data = np.fromfile(path, dtype=np.float32).reshape((bs, -1, len_catagory))
    else:
        data = np.fromfile(path, dtype=np.int64).reshape((bs, -1))
    return pad_data(data, seq)


def evaluate(result_dir, label_dir, bs):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    true_labels, true_predictions = [], []
    data_num = len(os.listdir(label_dir))
    emission_score = None
    labels = None
    attention_mask = None
    for data_idx in tqdm(range(data_num)):
        emission_score_path = [os.path.join(result_dir, f"{data_idx}_0.{fmt}") for fmt in ["npy", "bin"]]
        if os.path.exists(emission_score_path[0]):
            emission_score = pad_data_npy(emission_score_path[0])
        else:
            print(emission_score_path[1])
            emission_score = pad_data_bin(emission_score_path[1], "emission_score", bs)
        attention_mask_path = [os.path.join(result_dir, f"{data_idx}_1.{fmt}") for fmt in ["npy", "bin"]]
        if os.path.exists(attention_mask_path[0]):
            attention_mask = pad_data_npy(attention_mask_path[0])
        else:
            attention_mask = pad_data_bin(attention_mask_path[1], "attention_mask", bs)
        label_path = [os.path.join(label_dir, f"{data_idx}.{fmt}") for fmt in ["npy", "bin"]]
        if os.path.exists(label_path[0]):
            labels = pad_data_npy(label_path[0])
        else:
            labels = pad_data_bin(label_path[1], "labels", bs)
        labels = torch.Tensor(labels)
        # mask last data
        data_mask = labels[:, 0] >= 0
        labels = labels[data_mask]
        emission_score = torch.Tensor(emission_score)[data_mask]
        attention_mask = torch.Tensor(attention_mask)[data_mask]
        scores = crf.decode(emission_score, attention_mask)
        true_label = []
        for label in labels:
            true_label += [categories_id2label[int(l)] for l in label if l != -100]
        true_labels.append(true_label)
        true_prediction = []
        for score in scores:
            true_prediction += [categories_id2label[int(p)] for p in score if p != -100]
        true_predictions.append(true_prediction)
        attention_mask = labels.gt(0)
        # token-level metrics
        X += (scores.eq(labels) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += labels.gt(0).sum().item()
        # entity-level metrics
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(labels)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    eval_result = classification_report(true_labels, true_predictions, digits=4, mode='strict', scheme=IOB2)
    print(eval_result)
    f1, p1, r1 = 2 * X / (Y + Z), X / Y, X / Z
    f2, p2, r2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    print("val-token level: f1:{}, precision: {}, recall:{}".format(f1, p1, r1))
    print("val-entity level: f1:{}, precision: {}, recall:{}".format(f2, p2, r2))
    result_dict = {
        "seqeval_result": eval_result,
        "val-token level": {"f1": f1, "precision": p1, "recall": r1},
        "val-entity level": {"f1": f2, "precision": p2, "recall": r2}
    }
    return result_dict


def trans_entity2tuple(scores):
    '''Convert a tensor into a set of (sample id, start, end, entity type) tuples for metric computation
    '''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and \
                    (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids


def parse_arguments():
    parser = argparse.ArgumentParser(description='Bert_Base_Chinese postprocess for sequence labeling task.')
    parser.add_argument('-i', '--result_dir', type=str, required=True, help='result dir for prediction results')
    parser.add_argument('-o', '--out_path', type=str, required=True, help='save path for evaluation result')
    parser.add_argument('-l', '--label_dir', type=str, required=True, help='label dir for label results')
    parser.add_argument('-c', '--config_path', type=str, required=True, help='config path for export model')
    parser.add_argument('-k', '--ckpt_path', type=str, default="./best_model.pt", help='result dir for prediction results')
    parser.add_argument('-bs', '--batch_size', type=int, default=64, help='Batch size of output data.')
    arguments = parser.parse_args()
    arguments.out_path = os.path.abspath(arguments.out_path)
    dir_name = os.path.dirname(arguments.out_path)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    return arguments


if __name__ == '__main__':
    args = parse_arguments()
    model = build_model(args.config_path, args.ckpt_path)
    categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
    categories_id2label = {i: k for i, k in enumerate(categories)}
    crf = model.crf
    evaluate_results = evaluate(args.result_dir, args.label_dir, args.batch_size)
    with open(args.out_path, 'w') as f:
        json.dump(evaluate_results, f, ensure_ascii=False, indent=4)
```
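One detail worth flagging in evaluate() above: seqeval's classification_report expects a list of tag sequences (one inner list per sentence or batch), not a flat list of tags. A micro-example of the call as used above (illustrative, not part of the commit):

```python
# seqeval scores at the entity level; the missed B-PER costs a whole entity.
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

y_true = [['B-LOC', 'I-LOC', 'O', 'B-PER']]
y_pred = [['B-LOC', 'I-LOC', 'O', 'O']]
print(classification_report(y_true, y_pred, digits=4, mode='strict', scheme=IOB2))
```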
bert/bert4torch_cmcc/examples/sequence_labeling/onnx_convert.sh (new file, mode 100644):
```bash
python3 -m onnxruntime.transformers.optimizer \
    --input /home/sunzhq/workspace/onnx_models/bert/bert_best.onnx \
    --output /home/sunzhq/workspace/onnx_models/bert/bert_best_fused.onnx \
    --use_multi_head_attention \
    --num_heads 12 \
    --hidden_size 768 \
    --model_type bert \
    --disable_skip_layer_norm \
    --disable_gelu \
    --use_gpu \
    --disable_embed_layer_norm \
    --use_mask_index \
    --use_raw_attention_mask
    # --no_attention_mask \
```
bert/bert4torch_cmcc/examples/sequence_labeling/onnx_inference.sh (new file, mode 100644):
```bash
source /opt/dtk/env.sh
export HIP_PRINTF_DEBUG_FOR_FP64=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIGRAPHX_ENABLE_GEMM_SOFTMAX_GEMM_FUSE=1  # performance gain, affects accuracy
export MIGRAPHX_ENABLE_MHA=1
export HIP_VISIBLE_DEVICES=3
numactl -N 3 -m 3 python bert_migraphx.py
# export HIP_VISIBLE_DEVICES=0
# nohup numactl -N 0 -m 0 python bert_migraphx.py 2>&1 | tee result_0.log &
# export HIP_VISIBLE_DEVICES=1
# nohup numactl -N 1 -m 1 python bert_migraphx.py 2>&1 | tee result_1.log &
# export HIP_VISIBLE_DEVICES=2
# nohup numactl -N 2 -m 2 python bert_migraphx.py 2>&1 | tee result_2.log &
# export HIP_VISIBLE_DEVICES=3
# nohup numactl -N 3 -m 3 python bert_migraphx.py 2>&1 | tee result_3.log &
# export HIP_VISIBLE_DEVICES=4
# nohup python bert_migraphx.py 2>&1 | tee result_4.log &
# export HIP_VISIBLE_DEVICES=5
# nohup python bert_migraphx.py 2>&1 | tee result_5.log &
# export HIP_VISIBLE_DEVICES=6
# nohup python bert_migraphx.py 2>&1 | tee result_6.log &
# export HIP_VISIBLE_DEVICES=7
# nohup python bert_migraphx.py 2>&1 | tee result_7.log &
```
bert/bert4torch_cmcc/examples/sequence_labeling/outpath (new file, mode 100644):
```json
{
    "seqeval_result": " precision recall f1-score support\n\n LOC 0.9662 0.9703 0.9683 1887\n ORG 0.9289 0.9431 0.9360 984\n PER 0.9673 0.9706 0.9689 884\n\n micro avg 0.9566 0.9632 0.9599 3755\n macro avg 0.9542 0.9613 0.9577 3755\nweighted avg 0.9567 0.9632 0.9600 3755\n",
    "val-token level": {
        "f1": 0.9724224643755242,
        "precision": 0.9683639398998333,
        "recall": 0.9765151515151517
    },
    "val-entity level": {
        "f1": 0.9599256900212325,
        "precision": 0.9566252314202603,
        "recall": 0.9632490013315589
    }
}
```
bert/bert4torch_cmcc/examples/sequence_labeling/output (new file, mode 100644):
```json
{
    "seqeval_result": " precision recall f1-score support\n\n LOC 0.9691 0.9712 0.9702 1876\n ORG 0.9286 0.9409 0.9347 982\n PER 0.9659 0.9692 0.9675 876\n\n micro avg 0.9576 0.9628 0.9602 3734\n macro avg 0.9546 0.9604 0.9575 3734\nweighted avg 0.9577 0.9628 0.9602 3734\n",
    "val-token level": {
        "f1": 0.9726518056550506,
        "precision": 0.9692617787855886,
        "recall": 0.9760656292286877
    },
    "val-entity level": {
        "f1": 0.9602029914529925,
        "precision": 0.957645178476293,
        "recall": 0.9627745045527595
    }
}
```
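Both outpath and output follow the JSON schema written by bertbase_postprocess.py. A small illustrative reader (not part of the commit; assumes it runs from this directory):

```python
# Compare the entity-level f1 recorded in the two result files above.
import json

for path in ["outpath", "output"]:
    with open(path) as f:
        result = json.load(f)
    print(path, "entity-level f1:", result["val-entity level"]["f1"])
```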
bert/bert4torch_cmcc/examples/sequence_labeling/post.sh (new file, mode 100644):
```bash
for index in {0..7}
do
    python bertbase_postprocess.py -i results/${index}/data -l results/${index}/label -o output -c bert-base-chinese/config.json -k best_model.pt
done
```
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_W2NER.py (new file, mode 100644; the listing below is cut off at the end of the shown diff):
#! -*- coding:utf-8 -*-
# W2NER: https://github.com/ljynlp/W2NER
# 数据集:http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
from
torch.utils.data
import
DataLoader
import
torch.nn
as
nn
import
torch.optim
as
optim
from
bert4torch.snippets
import
sequence_padding
,
Callback
,
ListDataset
,
seed_everything
from
bert4torch.optimizers
import
get_linear_schedule_with_warmup
from
bert4torch.layers
import
LayerNorm
from
bert4torch.tokenizers
import
Tokenizer
from
bert4torch.models
import
build_transformer_model
,
BaseModel
from
tqdm
import
tqdm
from
torch.nn.utils.rnn
import
pack_padded_sequence
,
pad_packed_sequence
from
collections
import
defaultdict
,
deque
from
sklearn.metrics
import
precision_recall_fscore_support
# 模型参数:训练
epochs
=
20
# 训练轮数
steps_per_epoch
=
100
# 每轮步数
maxlen
=
256
# 最大长度
batch_size
=
8
# 根据gpu显存设置
learning_rate
=
1e-3
clip_grad_norm
=
5.0
bert_learning_rate
=
5e-6
warm_factor
=
0.1
weight_decay
=
0
use_bert_last_4_layers
=
True
categories
=
{
'LOC'
:
2
,
'PER'
:
3
,
'ORG'
:
4
}
label_num
=
len
(
categories
)
+
2
# 模型参数:网络结构
dist_emb_size
=
20
type_emb_size
=
20
lstm_hid_size
=
512
conv_hid_size
=
96
bert_hid_size
=
768
biaffine_size
=
512
ffnn_hid_size
=
288
dilation
=
[
1
,
2
,
3
]
emb_dropout
=
0.5
conv_dropout
=
0.5
out_dropout
=
0.33
# BERT base
config_path
=
'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/config.json'
checkpoint_path
=
'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/bert4torch_pytorch_model.bin'
dict_path
=
'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/vocab.txt'
device
=
'cuda'
if
torch
.
cuda
.
is_available
()
else
'cpu'
# 固定seed
seed_everything
(
42
)
# 建立分词器
tokenizer
=
Tokenizer
(
dict_path
,
do_lower_case
=
True
)
# 相对距离设置
dis2idx
=
np
.
zeros
((
1000
),
dtype
=
'int64'
)
dis2idx
[
1
]
=
1
dis2idx
[
2
:]
=
2
dis2idx
[
4
:]
=
3
dis2idx
[
8
:]
=
4
dis2idx
[
16
:]
=
5
dis2idx
[
32
:]
=
6
dis2idx
[
64
:]
=
7
dis2idx
[
128
:]
=
8
dis2idx
[
256
:]
=
9
# 用到的小函数
def
convert_index_to_text
(
index
,
type
):
text
=
"-"
.
join
([
str
(
i
)
for
i
in
index
])
text
=
text
+
"-#-{}"
.
format
(
type
)
return
text
def
convert_text_to_index
(
text
):
index
,
type
=
text
.
split
(
"-#-"
)
index
=
[
int
(
x
)
for
x
in
index
.
split
(
"-"
)]
return
index
,
int
(
type
)
# 加载数据集
class
MyDataset
(
ListDataset
):
@
staticmethod
def
load_data
(
filename
):
D
=
[]
with
open
(
filename
,
encoding
=
'utf-8'
)
as
f
:
f
=
f
.
read
()
for
l
in
tqdm
(
f
.
split
(
'
\n\n
'
),
desc
=
'Load data'
):
if
not
l
:
continue
sentence
,
d
=
[],
[]
for
i
,
c
in
enumerate
(
l
.
split
(
'
\n
'
)):
char
,
flag
=
c
.
split
(
' '
)
sentence
+=
char
if
flag
[
0
]
==
'B'
:
d
.
append
([
i
,
i
,
flag
[
2
:]])
elif
flag
[
0
]
==
'I'
:
d
[
-
1
][
1
]
=
i
if
len
(
sentence
)
>
maxlen
-
2
:
continue
tokens
=
[
tokenizer
.
tokenize
(
word
)[
1
:
-
1
]
for
word
in
sentence
[:
maxlen
-
2
]]
pieces
=
[
piece
for
pieces
in
tokens
for
piece
in
pieces
]
tokens_ids
=
[
tokenizer
.
_token_start_id
]
+
tokenizer
.
tokens_to_ids
(
pieces
)
+
[
tokenizer
.
_token_end_id
]
assert
len
(
tokens_ids
)
<=
maxlen
length
=
len
(
tokens
)
# piece和word的对应关系,中文两者一致,除了[CLS]和[SEP]
_pieces2word
=
np
.
zeros
((
length
,
len
(
tokens_ids
)),
dtype
=
np
.
bool
)
e_start
=
0
for
i
,
pieces
in
enumerate
(
tokens
):
if
len
(
pieces
)
==
0
:
continue
pieces
=
list
(
range
(
e_start
,
e_start
+
len
(
pieces
)))
_pieces2word
[
i
,
pieces
[
0
]
+
1
:
pieces
[
-
1
]
+
2
]
=
1
e_start
+=
len
(
pieces
)
# 相对距离
_dist_inputs
=
np
.
zeros
((
length
,
length
),
dtype
=
np
.
int
)
for
k
in
range
(
length
):
_dist_inputs
[
k
,
:]
+=
k
_dist_inputs
[:,
k
]
-=
k
for
i
in
range
(
length
):
for
j
in
range
(
length
):
if
_dist_inputs
[
i
,
j
]
<
0
:
_dist_inputs
[
i
,
j
]
=
dis2idx
[
-
_dist_inputs
[
i
,
j
]]
+
9
else
:
_dist_inputs
[
i
,
j
]
=
dis2idx
[
_dist_inputs
[
i
,
j
]]
_dist_inputs
[
_dist_inputs
==
0
]
=
19
# golden标签
_grid_labels
=
np
.
zeros
((
length
,
length
),
dtype
=
np
.
int
)
_grid_mask2d
=
np
.
ones
((
length
,
length
),
dtype
=
np
.
bool
)
for
entity
in
d
:
e_start
,
e_end
,
e_type
=
entity
[
0
],
entity
[
1
]
+
1
,
entity
[
-
1
]
if
e_end
>=
maxlen
-
2
:
continue
index
=
list
(
range
(
e_start
,
e_end
))
for
i
in
range
(
len
(
index
)):
if
i
+
1
>=
len
(
index
):
break
_grid_labels
[
index
[
i
],
index
[
i
+
1
]]
=
1
_grid_labels
[
index
[
-
1
],
index
[
0
]]
=
categories
[
e_type
]
_entity_text
=
set
([
convert_index_to_text
(
list
(
range
(
e
[
0
],
e
[
1
]
+
1
)),
categories
[
e
[
-
1
]])
for
e
in
d
])
D
.
append
((
tokens_ids
,
_pieces2word
,
_dist_inputs
,
_grid_labels
,
_grid_mask2d
,
_entity_text
))
return
D
def collate_fn(data):
    tokens_ids, pieces2word, dist_inputs, grid_labels, grid_mask2d, _entity_text = map(list, zip(*data))
    sent_length = torch.tensor([i.shape[0] for i in pieces2word], dtype=torch.long, device=device)
    # max_wordlen: length in words (not tokens); max_tokenlen: length in tokens
    max_wordlen = torch.max(sent_length).item()
    max_tokenlen = np.max([len(x) for x in tokens_ids])
    tokens_ids = torch.tensor(sequence_padding(tokens_ids), dtype=torch.long, device=device)
    batch_size = tokens_ids.size(0)

    def fill(data, new_data):
        for j, x in enumerate(data):
            new_data[j, :x.shape[0], :x.shape[1]] = torch.tensor(x, dtype=torch.long, device=device)
        return new_data

    dis_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.long, device=device)
    dist_inputs = fill(dist_inputs, dis_mat)
    labels_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.long, device=device)
    grid_labels = fill(grid_labels, labels_mat)
    mask2d_mat = torch.zeros((batch_size, max_wordlen, max_wordlen), dtype=torch.bool, device=device)
    grid_mask2d = fill(grid_mask2d, mask2d_mat)
    sub_mat = torch.zeros((batch_size, max_wordlen, max_tokenlen), dtype=torch.bool, device=device)
    pieces2word = fill(pieces2word, sub_mat)
    return [tokens_ids, pieces2word, dist_inputs, sent_length, grid_mask2d], [grid_labels, grid_mask2d, _entity_text]

# load the data
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# model structure on top of bert
class ConvolutionLayer(nn.Module):
    '''Convolution layer'''
    def __init__(self, input_size, channels, dilation, dropout=0.1):
        super(ConvolutionLayer, self).__init__()
        self.base = nn.Sequential(
            nn.Dropout2d(dropout),
            nn.Conv2d(input_size, channels, kernel_size=1),
            nn.GELU(),
        )
        self.convs = nn.ModuleList(
            [nn.Conv2d(channels, channels, kernel_size=3, groups=channels, dilation=d, padding=d) for d in dilation])

    def forward(self, x):
        x = x.permute(0, 3, 1, 2).contiguous()
        x = self.base(x)
        outputs = []
        for conv in self.convs:
            x = conv(x)
            x = F.gelu(x)
            outputs.append(x)
        outputs = torch.cat(outputs, dim=1)
        outputs = outputs.permute(0, 2, 3, 1).contiguous()
        return outputs


class Biaffine(nn.Module):
    '''Biaffine transform'''
    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super(Biaffine, self).__init__()
        self.n_in = n_in
        self.n_out = n_out
        self.bias_x = bias_x
        self.bias_y = bias_y
        weight = torch.zeros((n_out, n_in + int(bias_x), n_in + int(bias_y)))
        nn.init.xavier_normal_(weight)
        self.weight = nn.Parameter(weight, requires_grad=True)

    def extra_repr(self):
        s = f"n_in={self.n_in}, n_out={self.n_out}"
        if self.bias_x:
            s += f", bias_x={self.bias_x}"
        if self.bias_y:
            s += f", bias_y={self.bias_y}"
        return s

    def forward(self, x, y):
        if self.bias_x:
            x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
        if self.bias_y:
            y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
        # [batch_size, n_out, seq_len, seq_len]
        s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # remove dim 1 if n_out == 1
        s = s.permute(0, 2, 3, 1)
        return s
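# Added shape note: with n_in=biaffine_size and n_out=cls_num, Biaffine maps
# x, y of shape [btz, seq_len, n_in] to scores of shape [btz, seq_len, seq_len, n_out],
# one score per (head word, tail word, label) triple. A minimal check, with
# hypothetical sizes:
#   b = Biaffine(n_in=8, n_out=5)
#   assert b(torch.rand(2, 7, 8), torch.rand(2, 7, 8)).shape == (2, 7, 7, 5)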
class MLP(nn.Module):
    '''MLP (fully-connected block)'''
    def __init__(self, n_in, n_out, dropout=0):
        super().__init__()
        self.linear = nn.Linear(n_in, n_out)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(x)
        x = self.linear(x)
        x = self.activation(x)
        return x


class CoPredictor(nn.Module):
    def __init__(self, cls_num, hid_size, biaffine_size, channels, ffnn_hid_size, dropout=0):
        super().__init__()
        self.mlp1 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
        self.mlp2 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
        self.biaffine = Biaffine(n_in=biaffine_size, n_out=cls_num, bias_x=True, bias_y=True)
        self.mlp_rel = MLP(channels, ffnn_hid_size, dropout=dropout)
        self.linear = nn.Linear(ffnn_hid_size, cls_num)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, y, z):
        h = self.dropout(self.mlp1(x))
        t = self.dropout(self.mlp2(y))
        o1 = self.biaffine(h, t)
        z = self.dropout(self.mlp_rel(z))
        o2 = self.linear(z)
        return o1 + o2


class Model(BaseModel):
    def __init__(self, use_bert_last_4_layers=False):
        super().__init__()
        self.use_bert_last_4_layers = use_bert_last_4_layers
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path,
                                            # segment_vocab_size=0,
                                            output_all_encoded_layers=use_bert_last_4_layers)
        lstm_input_size = self.bert.configs['hidden_size']

        self.dis_embs = nn.Embedding(20, dist_emb_size)
        self.reg_embs = nn.Embedding(3, type_emb_size)
        self.encoder = nn.LSTM(lstm_input_size, lstm_hid_size // 2, num_layers=1, batch_first=True, bidirectional=True)

        conv_input_size = lstm_hid_size + dist_emb_size + type_emb_size
        self.convLayer = ConvolutionLayer(conv_input_size, conv_hid_size, dilation, conv_dropout)
        self.dropout = nn.Dropout(emb_dropout)
        self.predictor = CoPredictor(label_num, lstm_hid_size, biaffine_size, conv_hid_size * len(dilation), ffnn_hid_size, out_dropout)
        self.cln = LayerNorm(lstm_hid_size, conditional_size=lstm_hid_size)

    def forward(self, token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d):
        bert_embs = self.bert([token_ids, torch.zeros_like(token_ids)])
        if self.use_bert_last_4_layers:
            bert_embs = torch.stack(bert_embs[-4:], dim=-1).mean(-1)

        length = pieces2word.size(1)
        min_value = torch.min(bert_embs).item()

        # max pooling over the word pieces of each word
        _bert_embs = bert_embs.unsqueeze(1).expand(-1, length, -1, -1)
        _bert_embs = torch.masked_fill(_bert_embs, pieces2word.eq(0).unsqueeze(-1), min_value)
        word_reps, _ = torch.max(_bert_embs, dim=2)

        # LSTM
        word_reps = self.dropout(word_reps)
        packed_embs = pack_padded_sequence(word_reps, sent_length.cpu(), batch_first=True, enforce_sorted=False)
        packed_outs, (hidden, _) = self.encoder(packed_embs)
        word_reps, _ = pad_packed_sequence(packed_outs, batch_first=True, total_length=sent_length.max())

        # conditional LayerNorm
        cln = self.cln([word_reps.unsqueeze(2), word_reps])

        # concat
        dis_emb = self.dis_embs(dist_inputs)
        tril_mask = torch.tril(grid_mask2d.clone().long())
        reg_inputs = tril_mask + grid_mask2d.clone().long()
        reg_emb = self.reg_embs(reg_inputs)
        conv_inputs = torch.cat([dis_emb, reg_emb, cln], dim=-1)

        # convolution layer
        conv_inputs = torch.masked_fill(conv_inputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
        conv_outputs = self.convLayer(conv_inputs)
        conv_outputs = torch.masked_fill(conv_outputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)

        # output layer
        outputs = self.predictor(word_reps, word_reps, conv_outputs)
        return outputs

model = Model(use_bert_last_4_layers).to(device)
class Loss(nn.CrossEntropyLoss):
    def forward(self, outputs, labels):
        grid_labels, grid_mask2d, _ = labels
        grid_mask2d = grid_mask2d.clone()
        return super().forward(outputs[grid_mask2d], grid_labels[grid_mask2d])

bert_params = set(model.bert.parameters())
other_params = list(set(model.parameters()) - bert_params)
no_decay = ['bias', 'LayerNorm.weight']
params = [
    {'params': [p for n, p in model.bert.named_parameters() if not any(nd in n for nd in no_decay)],
     'lr': bert_learning_rate, 'weight_decay': weight_decay},
    {'params': [p for n, p in model.bert.named_parameters() if any(nd in n for nd in no_decay)],
     'lr': bert_learning_rate, 'weight_decay': 0.0},
    {'params': other_params, 'lr': learning_rate, 'weight_decay': weight_decay},
]
optimizer = optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
updates_total = (len(train_dataloader) if steps_per_epoch is None else steps_per_epoch) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warm_factor * updates_total, num_training_steps=updates_total)
model.compile(loss=Loss(), optimizer=optimizer, scheduler=scheduler, clip_grad_norm=5.0)
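# Added note: get_linear_schedule_with_warmup ramps the learning rate linearly from 0
# over the first warm_factor * updates_total steps, then decays it linearly back to 0
# at updates_total steps; the BERT body and the task head keep their separate base
# learning rates from the parameter groups above.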
class Evaluator(Callback):
    """Evaluate and save"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, p, r, e_f1, e_p, e_r = self.evaluate(valid_dataloader)
        if e_f1 > self.best_val_f1:
            self.best_val_f1 = e_f1
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {p:.5f} r: {r:.5f}')
        print(f'[val-entity level] f1: {e_f1:.5f}, p: {e_p:.5f} r: {e_r:.5f} best_f1: {self.best_val_f1:.5f}\n')

    def evaluate(self, data_loader):
        def cal_f1(c, p, r):
            if r == 0 or p == 0:
                return 0, 0, 0
            r = c / r if r else 0
            p = c / p if p else 0
            if r and p:
                return 2 * p * r / (p + r), p, r
            return 0, p, r

        pred_result = []
        label_result = []
        total_ent_r = 0
        total_ent_p = 0
        total_ent_c = 0
        for data_batch in tqdm(data_loader, desc='Evaluate'):
            (token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d), (grid_labels, grid_mask2d, entity_text) = data_batch
            outputs = model.predict([token_ids, pieces2word, dist_inputs, sent_length, grid_mask2d])
            grid_mask2d = grid_mask2d.clone()

            outputs = torch.argmax(outputs, -1)
            ent_c, ent_p, ent_r, _ = self.decode(outputs.cpu().numpy(), entity_text, sent_length.cpu().numpy())
            total_ent_r += ent_r
            total_ent_p += ent_p
            total_ent_c += ent_c

            grid_labels = grid_labels[grid_mask2d].contiguous().view(-1)
            outputs = outputs[grid_mask2d].contiguous().view(-1)
            label_result.append(grid_labels.cpu())
            pred_result.append(outputs.cpu())

        label_result = torch.cat(label_result)
        pred_result = torch.cat(pred_result)
        p, r, f1, _ = precision_recall_fscore_support(label_result.numpy(), pred_result.numpy(), average="macro")
        e_f1, e_p, e_r = cal_f1(total_ent_c, total_ent_p, total_ent_r)
        return f1, p, r, e_f1, e_p, e_r

    def decode(self, outputs, entities, length):
        class Node:
            def __init__(self):
                self.THW = []                # [(tail, type)]
                self.NNW = defaultdict(set)  # {(head, tail): {next_index}}

        ent_r, ent_p, ent_c = 0, 0, 0
        decode_entities = []
        q = deque()
        for instance, ent_set, l in zip(outputs, entities, length):
            predicts = []
            nodes = [Node() for _ in range(l)]
            for cur in reversed(range(l)):
                heads = []
                for pre in range(cur + 1):
                    # THW
                    if instance[cur, pre] > 1:
                        nodes[pre].THW.append((cur, instance[cur, pre]))
                        heads.append(pre)
                    # NNW
                    if pre < cur and instance[pre, cur] == 1:
                        # cur node
                        for head in heads:
                            nodes[pre].NNW[(head, cur)].add(cur)
                        # post nodes
                        for head, tail in nodes[cur].NNW.keys():
                            if tail >= cur and head <= pre:
                                nodes[pre].NNW[(head, tail)].add(cur)
                # entity
                for tail, type_id in nodes[cur].THW:
                    if cur == tail:
                        predicts.append(([cur], type_id))
                        continue
                    q.clear()
                    q.append([cur])
                    while len(q) > 0:
                        chains = q.pop()
                        for idx in nodes[chains[-1]].NNW[(cur, tail)]:
                            if idx == tail:
                                predicts.append((chains + [idx], type_id))
                            else:
                                q.append(chains + [idx])

            predicts = set([convert_index_to_text(x[0], x[1]) for x in predicts])
            decode_entities.append([convert_text_to_index(x) for x in predicts])
            ent_r += len(ent_set)
            ent_p += len(predicts)
            ent_c += len(predicts.intersection(ent_set))
        return ent_c, ent_p, ent_r, decode_entities


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
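A minimal, self-contained sketch (added; the toy grid below is hypothetical, not from the repo) of how a W2NER grid decodes: cells grid[i, j] == 1 with i < j are Next-Neighboring-Word links, and grid[tail, head] > 1 is a Tail-Head-Word cell carrying the entity-type id. Evaluator.decode above performs the same walk, generalized to branching chains and nested spans.

import numpy as np

grid = np.zeros((3, 3), dtype=int)   # toy 3-word sentence
grid[0, 1] = 1                       # NNW: word0 -> word1
grid[1, 2] = 1                       # NNW: word1 -> word2
grid[2, 0] = 2                       # THW: tail=2, head=0, type id 2 (ids > 1 are types)

head, tail, type_id = 0, 2, grid[2, 0]
chain = [head]
while chain[-1] != tail:             # follow NNW links from head to tail
    nxt = chain[-1] + 1 + int(np.argmax(grid[chain[-1], chain[-1] + 1:] == 1))
    chain.append(nxt)
print(chain, type_id)                # [0, 1, 2] 2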
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_cascade_crf.py
#! -*- coding:utf-8 -*-
# bert+crf cascade approach: stage one tags BIO, stage two classifies each recognized span
# reference blog (Chinese): https://zhuanlan.zhihu.com/p/166496466
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 98.11; entity_level: 96.23

import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix the random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels, batch_entity_ids, batch_entity_labels = [], [], [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        entity_ids, entity_labels = [], []
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = 1  # mark B
                labels[start + 1:end + 1] = 2  # mark I
                entity_ids.append([start, end])
                entity_labels.append(categories.index(label) + 1)
        if not entity_ids:  # at least one label is required; pad with 0 if absent
            entity_ids.append([0, 0])
            entity_labels.append(0)
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
        batch_entity_ids.append(entity_ids)
        batch_entity_labels.append(entity_labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)  # [btz, num_entities, start/end]
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device)  # [btz, num_entities]
    return [batch_token_ids, batch_entity_ids], [batch_labels, batch_entity_labels]

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# model structure on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.dense1 = nn.Linear(768, len(categories))
        self.dense2 = nn.Linear(768, len(categories) + 1)  # includes the padding class
        self.crf = CRF(len(categories))

    def forward(self, inputs):
        # stage-one output
        token_ids, entity_ids = inputs[0], inputs[1]
        last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.dense1(last_hidden_state)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0)

        # stage-two output
        btz, entity_count, _ = entity_ids.shape
        hidden_size = last_hidden_state.shape[-1]
        entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
        entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
        entity_states = torch.mean(entity_states, dim=2)  # mean of the entity's start/end hidden states
        entity_logit = self.dense2(entity_states)  # [btz, num_entities, num_entity_types]
        return emission_score, attention_mask, entity_logit

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            # stage-one inference
            last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
            emission_score = self.dense1(last_hidden_state)  # [btz, seq_len, tag_size]
            attention_mask = token_ids.gt(0)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]

            # stage-two inference
            batch_entity_ids = []
            for one_samp in best_path:
                entity_ids = []
                for j, item in enumerate(one_samp):
                    if item.item() == 1:  # B
                        entity_ids.append([j, j])
                    elif len(entity_ids) == 0:
                        continue
                    elif (len(entity_ids[-1]) > 0) and (item.item() == 2):  # I
                        entity_ids[-1][-1] = j
                    elif len(entity_ids[-1]) > 0:
                        entity_ids.append([])
                if not entity_ids:  # at least one span is required; pad with 0 if absent
                    entity_ids.append([0, 0])
                batch_entity_ids.append([i for i in entity_ids if i])
            batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)  # [btz, num_entities, start/end]

            btz, entity_count, _ = batch_entity_ids.shape
            hidden_size = last_hidden_state.shape[-1]
            gather_index = batch_entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
            entity_states = torch.gather(last_hidden_state, dim=1, index=gather_index).reshape(btz, entity_count, -1, hidden_size)
            entity_states = torch.mean(entity_states, dim=2)  # mean of the entity's start/end hidden states
            entity_logit = self.dense2(entity_states)  # [btz, num_entities, num_entity_types]
            entity_pred = torch.argmax(entity_logit, dim=-1)  # [btz, num_entities]

        # each element is a (sample_id, start, end, type) tuple
        entity_tuple = trans_entity2tuple(batch_entity_ids, entity_pred)
        return best_path, entity_tuple

model = Model().to(device)

class Loss(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.loss2 = nn.CrossEntropyLoss(ignore_index=0)

    def forward(self, outputs, labels):
        emission_score, attention_mask, entity_logit = outputs
        seq_labels, entity_labels = labels
        loss1 = model.crf(emission_score, attention_mask, seq_labels)
        loss2 = self.loss2(entity_logit.reshape(-1, entity_logit.shape[-1]), entity_labels.flatten())
        return {'loss': loss1 + loss2, 'loss1': loss1, 'loss2': loss2}

# keys returned by Loss are added to the metrics automatically, so loss1 and loss2
# are printed even without specifying metrics below
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X1, Y1, Z1 = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for (token_ids, entity_ids), (label, entity_labels) in tqdm(data):
        scores, entity_pred = model.predict(token_ids)  # [btz, seq_len]
        # stage-one metric: token level
        attention_mask = label.gt(0)
        X1 += (scores.eq(label) * attention_mask).sum().item()
        Y1 += scores.gt(0).sum().item()
        Z1 += label.gt(0).sum().item()
        # stage-two metric: entity level
        entity_true = trans_entity2tuple(entity_ids, entity_labels)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X1 / (Y1 + Z1), X1 / Y1, X1 / Z1
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(entity_ids, entity_labels):
    '''Convert tensors to (sample_id, start, end, entity type) tuples for metric computation'''
    entity_true = set()
    for i, one_sample in enumerate(entity_ids):
        for j, item in enumerate(one_sample):
            if item[0].item() * item[1].item() != 0:
                entity_true.add((i, item[0].item(), item[1].item(), entity_labels[i, j].item()))
    return entity_true

class Evaluator(Callback):
    """Evaluate and save"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-stage 1] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-stage 2] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
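A hedged sketch of the cascade's second stage in isolation (added; toy tensors, sizes illustrative): given a span found by the BIO/CRF stage, pool the start and end hidden states (which is what torch.gather plus the mean over dim=2 computes above) and classify the span.

import torch

hidden = torch.randn(1, 6, 768)              # [btz, seq_len, hdsz] from BERT
span = [1, 3]                                # a span from stage one
states = hidden[0, [span[0], span[1]]]       # start/end hidden states only
pooled = states.mean(dim=0)                  # [hdsz]
logits = torch.nn.Linear(768, 4)(pooled)     # len(categories) + 1 classes incl. padding
print(logits.argmax().item())                # predicted entity-type id (0 = padding)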
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf.py
#! -*- coding:utf-8 -*-
# bert+crf for named entity recognition
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90

import time
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix the random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# model structure on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))
        self.crf = CRF(len(categories))  # includes start/end transitions

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

def acc(y_pred, y_true):
    y_pred = y_pred[0]
    y_pred = torch.argmax(y_pred, dim=-1)
    acc = torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
    return {'acc': acc}

# several custom metric styles are supported: metrics = ['accuracy', acc, {acc: acc}] all work
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=acc)

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token level
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor to (sample_id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate and save"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        logs["f1"] = f2
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    # time_fw is the timing-log file object; the log path is 'log/time.txt'
    time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
    # record the start time of the run
    time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
    # record the end time of the run
    time_fw.write('End Time: {:.6f}\n'.format(time.time()))
    time_fw.flush()
    time_fw.close()
else:
    model.load_weights('best_model.pt')
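Minimal inference sketch (added; assumes trained weights are loaded, and that bert4torch's Tokenizer.encode returns (token_ids, segment_ids) as in bert4keras):

text = '人民日报位于北京'                     # hypothetical input
token_ids = torch.tensor([tokenizer.encode(text, maxlen=maxlen)[0]], device=device)
best_path = model.predict(token_ids)         # [1, seq_len] tag ids
tags = [categories_id2label[t.item()] for t in best_path[0]]
print(list(zip(tokenizer.tokenize(text), tags)))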
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf_add_posseg.py
#! -*- coding:utf-8 -*-
# bert+crf for NER, with part-of-speech tags added as an extra embedding
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.30; entity_level: 96.09

import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import jieba.posseg as psg
from collections import Counter

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix the random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

psg_map = {v: i + 1 for i, v in enumerate(
    ['a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
     'mg', 'mq', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 'rg', 'rr',
     'rz', 's', 't', 'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn',
     'vq', 'x', 'y', 'z', 'zg'])}

def collate_fn(batch):
    batch_token_ids, batch_psg_ids, batch_labels = [], [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)  # char span of token i in the original text
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)

        # build the POS-tag inputs
        seg = [(i, p) for word, p in psg.cut(d[0]) for i in word]
        seg_word, seg_p = zip(*seg)
        psg_ids = np.zeros(len(token_ids))
        for i, j in enumerate(mapping):
            if j:
                start, end = j[0], j[-1]  # first/last char position of the token in the text
                token_new = (''.join(seg_word[start:end + 1])).lower()
                assert tokens[i] == token_new, f"{tokens[i]} -> {token_new}"
                if start == end:
                    psg_ids[i] = psg_map.get(seg_p[start], 0)  # 0 for tags missing from the map
                else:
                    psg_ids[i] = psg_map.get(Counter(seg_p[start:end + 1]).most_common(1)[0][0], 0)  # majority tag
        batch_psg_ids.append(psg_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_psg_ids = torch.tensor(sequence_padding(batch_psg_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return [batch_token_ids, batch_psg_ids], batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# model structure on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        layer_add_embs = nn.Embedding(len(psg_map) + 1, 768)
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0, layer_add_embs=layer_add_embs)
        self.fc = nn.Linear(768, len(categories))
        self.crf = CRF(len(categories))

    def forward(self, token_ids, psg_ids):
        sequence_output = self.bert([token_ids, psg_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0)
        return emission_score, attention_mask

    def predict(self, token_ids, psg_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids, psg_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for (token_ids, psg_ids), label in tqdm(data):
        scores = model.predict(token_ids, psg_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token level
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor to (sample_id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate and save"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
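Sketch (added) of how the per-character POS ids above are produced with jieba.posseg: each character inherits its word's tag, and tags missing from psg_map fall back to 0.

import jieba.posseg as psg

text = '我爱北京'
seg = [(ch, p) for word, p in psg.cut(text) for ch in word]
ids = [psg_map.get(p, 0) for _, p in seg]
print(seg, ids)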
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf_freeze.py
#! -*- coding:utf-8 -*-
# bert+crf for NER; tests two schemes for CRF transition weights derived from the
# dataset counts: using them only to initialize, or freezing them during training
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# init only: [valid_f1] token_level: 97.35; entity_level: 96.42
# frozen:    [valid_f1] token_level: 96.92; entity_level: 95.42

import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix the random seed
seed_everything(42)

# load the dataset
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in f.split('\n\n'):
            if not l:
                continue
            d = ['']
            for i, c in enumerate(l.split('\n')):
                char, flag = c.split(' ')
                d[0] += char
                if flag[0] == 'B':
                    d.append([i, i, flag[2:]])
                elif flag[0] == 'I':
                    d[-1][1] = i
            D.append(d)
    return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the datasets and dataloaders
train_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')
valid_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev')
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)

# derive transition weights from the training data
transition = np.zeros((len(categories), len(categories)))
start_transition = np.zeros(len(categories))
end_transition = np.zeros(len(categories))
for d in tqdm(train_data, desc='Generate init_transitions'):
    tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
    mapping = tokenizer.rematch(d[0], tokens)
    start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
    end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
    token_ids = tokenizer.tokens_to_ids(tokens)
    labels = np.zeros(len(token_ids))
    for start, end, label in d[1:]:
        if start in start_mapping and end in end_mapping:
            start = start_mapping[start]
            end = end_mapping[end]
            labels[start] = categories_label2id['B-' + label]
            labels[start + 1:end + 1] = categories_label2id['I-' + label]
    for i in range(len(labels) - 1):
        transition[int(labels[i]), int(labels[i + 1])] += 1
    start_transition[int(labels[0])] += 1  # transition from start to a tag
    end_transition[int(labels[-1])] += 1   # transition from a tag to end

max_v = np.max([np.max(transition), np.max(start_transition), np.max(end_transition)])
min_v = np.min([np.min(transition), np.min(start_transition), np.min(end_transition)])
transition = (transition - min_v) / (max_v - min_v)
start_transition = (start_transition - min_v) / (max_v - min_v)
end_transition = (end_transition - min_v) / (max_v - min_v)

# model structure on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))
        # init_transitions/freeze control initialization and whether the transitions are trainable
        self.crf = CRF(len(categories), init_transitions=[transition, start_transition, end_transition], freeze=True)

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token level
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor to (sample_id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate and save"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
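The count-based initialization above min-max normalizes all three transition tables with one shared min/max, so their relative magnitudes stay comparable. A compact equivalent (added sketch, standalone):

import numpy as np

def minmax_shared(*arrays):
    mx = max(a.max() for a in arrays)
    mn = min(a.min() for a in arrays)
    return [(a - mn) / (mx - mn) for a in arrays]

transition, start_transition, end_transition = minmax_shared(transition, start_transition, end_transition)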
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_crf_inference.py
#! -*- coding:utf-8 -*-
# bert+crf for named entity recognition (inference variant)
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90

import time
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = '/workspace/bert-base-chinese/config.json'
checkpoint_path = '/workspace/bert-base-chinese/pytorch_model.bin'
dict_path = '/workspace/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix the random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('/workspace/bert-base-chinese/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/workspace/bert-base-chinese/china-people-daily-ner-corpus/example.test'), batch_size=batch_size, collate_fn=collate_fn)

# model structure on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))
        self.crf = CRF(len(categories))  # includes start/end transitions

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

def acc(y_pred, y_true):
    y_pred = y_pred[0]
    y_pred = torch.argmax(y_pred, dim=-1)
    acc = torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
    return {'acc': acc}

# several custom metric styles are supported: metrics = ['accuracy', acc, {acc: acc}] all work
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=acc)

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token level
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor to (sample_id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate and save"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            model.save_weights('best_model.pt')
        logs["f1"] = f2
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    # time_fw is the timing-log file object; the log path is 'log/time.txt'
    time_fw = open(os.path.join('log/', 'time.txt'), 'a', encoding='utf-8')
    # record the start time of the run
    time_fw.write('Start Time: {:.6f}\n'.format(time.time()))
    # evaluator = Evaluator()
    # model.fit(train_dataloader, epochs=7, steps_per_epoch=None, callbacks=[evaluator])
    model.load_weights("best_model.pt")
    f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
    print(f1, precision, recall, f2, precision2, recall2)
    # record the end time of the run
    time_fw.write('End Time: {:.6f}\n'.format(time.time()))
    time_fw.flush()
    time_fw.close()
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_efficient_global_pointer.py
#! -*- coding:utf-8 -*-
# efficient_global_pointer用来做实体识别
# 数据集:http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# 博客:https://kexue.fm/archives/8373
# [valid_f1]: 96.55
import
numpy
as
np
from
bert4torch.models
import
build_transformer_model
,
BaseModel
import
torch
from
torch.utils.data
import
DataLoader
import
torch.nn
as
nn
import
torch.optim
as
optim
from
bert4torch.snippets
import
sequence_padding
,
Callback
,
ListDataset
,
seed_everything
from
bert4torch.tokenizers
import
Tokenizer
from
bert4torch.losses
import
MultilabelCategoricalCrossentropy
from
bert4torch.layers
import
EfficientGlobalPointer
maxlen
=
256
batch_size
=
16
categories_label2id
=
{
"LOC"
:
0
,
"ORG"
:
1
,
"PER"
:
2
}
categories_id2label
=
dict
((
value
,
key
)
for
key
,
value
in
categories_label2id
.
items
())
ner_vocab_size
=
len
(
categories_label2id
)
ner_head_size
=
64
# BERT base
config_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path
=
'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device
=
'cuda'
if
torch
.
cuda
.
is_available
()
else
'cpu'
# 固定seed
seed_everything
(
42
)
# 加载数据集
class
MyDataset
(
ListDataset
):
@
staticmethod
def
load_data
(
filename
):
data
=
[]
with
open
(
filename
,
encoding
=
'utf-8'
)
as
f
:
f
=
f
.
read
()
for
l
in
f
.
split
(
'
\n\n
'
):
if
not
l
:
continue
text
,
label
=
''
,
[]
for
i
,
c
in
enumerate
(
l
.
split
(
'
\n
'
)):
char
,
flag
=
c
.
split
(
' '
)
text
+=
char
if
flag
[
0
]
==
'B'
:
label
.
append
([
i
,
i
,
flag
[
2
:]])
elif
flag
[
0
]
==
'I'
:
label
[
-
1
][
1
]
=
i
data
.
append
((
text
,
label
))
# label为[[start, end, entity], ...]
return
data
# 建立分词器
tokenizer
=
Tokenizer
(
dict_path
,
do_lower_case
=
True
)
def
collate_fn
(
batch
):
batch_token_ids
,
batch_labels
=
[],
[]
for
i
,
(
text
,
text_labels
)
in
enumerate
(
batch
):
tokens
=
tokenizer
.
tokenize
(
text
,
maxlen
=
maxlen
)
mapping
=
tokenizer
.
rematch
(
text
,
tokens
)
start_mapping
=
{
j
[
0
]:
i
for
i
,
j
in
enumerate
(
mapping
)
if
j
}
end_mapping
=
{
j
[
-
1
]:
i
for
i
,
j
in
enumerate
(
mapping
)
if
j
}
token_ids
=
tokenizer
.
tokens_to_ids
(
tokens
)
labels
=
np
.
zeros
((
len
(
categories_label2id
),
maxlen
,
maxlen
))
for
start
,
end
,
label
in
text_labels
:
if
start
in
start_mapping
and
end
in
end_mapping
:
start
=
start_mapping
[
start
]
end
=
end_mapping
[
end
]
label
=
categories_label2id
[
label
]
labels
[
label
,
start
,
end
]
=
1
batch_token_ids
.
append
(
token_ids
)
# 前面已经限制了长度
batch_labels
.
append
(
labels
[:,
:
len
(
token_ids
),
:
len
(
token_ids
)])
batch_token_ids
=
torch
.
tensor
(
sequence_padding
(
batch_token_ids
),
dtype
=
torch
.
long
,
device
=
device
)
batch_labels
=
torch
.
tensor
(
sequence_padding
(
batch_labels
,
seq_dims
=
3
),
dtype
=
torch
.
long
,
device
=
device
)
return
batch_token_ids
,
batch_labels
# 转换数据集
train_dataloader
=
DataLoader
(
MyDataset
(
'F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'
),
batch_size
=
batch_size
,
shuffle
=
True
,
collate_fn
=
collate_fn
)
valid_dataloader
=
DataLoader
(
MyDataset
(
'F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'
),
batch_size
=
batch_size
,
collate_fn
=
collate_fn
)
# 定义bert上的模型结构
class
Model
(
BaseModel
):
def
__init__
(
self
):
super
().
__init__
()
self
.
bert
=
build_transformer_model
(
config_path
=
config_path
,
checkpoint_path
=
checkpoint_path
,
segment_vocab_size
=
0
)
self
.
global_pointer
=
EfficientGlobalPointer
(
hidden_size
=
768
,
heads
=
ner_vocab_size
,
head_size
=
ner_head_size
)
def
forward
(
self
,
token_ids
):
sequence_output
=
self
.
bert
([
token_ids
])
# [btz, seq_len, hdsz]
logit
=
self
.
global_pointer
(
sequence_output
,
token_ids
.
gt
(
0
).
long
())
return
logit
model
=
Model
().
to
(
device
)
class MyLoss(MultilabelCategoricalCrossentropy):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_pred, y_true):
        y_true = y_true.view(y_true.shape[0] * y_true.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        y_pred = y_pred.view(y_pred.shape[0] * y_pred.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        return super().forward(y_pred, y_true)

model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
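# For reference, a minimal stand-alone version of the multilabel categorical
# crossentropy that MyLoss flattens its inputs for, following the formula in
# https://kexue.fm/archives/7359. This sketch is ours, not bert4torch's code;
# the library class may differ in details such as reduction.
def _multilabel_categorical_crossentropy(y_pred, y_true):
    import torch
    y_pred = (1 - 2 * y_true) * y_pred         # flip the sign of positive-class scores
    y_pred_neg = y_pred - y_true * 1e12        # mask positives out of the negative term
    y_pred_pos = y_pred - (1 - y_true) * 1e12  # mask negatives out of the positive term
    zeros = torch.zeros_like(y_pred[..., :1])  # fixed 0 score of the threshold class
    neg_loss = torch.logsumexp(torch.cat([y_pred_neg, zeros], dim=-1), dim=-1)
    pos_loss = torch.logsumexp(torch.cat([y_pred_pos, zeros], dim=-1), dim=-1)
    return (neg_loss + pos_loss).mean()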
def evaluate(data, threshold=0):
    X, Y, Z = 0, 1e-10, 1e-10
    for x_true, label in data:
        scores = model.predict(x_true)
        for i, score in enumerate(scores):
            R = set()
            for l, start, end in zip(*np.where(score.cpu() > threshold)):
                R.add((start, end, categories_id2label[l]))
            T = set()
            for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
                T.add((start, end, categories_id2label[l]))
            X += len(R & T)
            Y += len(R)
            Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
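# How the metric arithmetic above behaves on a toy case: one predicted span
# matches one of two gold spans, so precision = 1/1, recall = 1/2, and
# F1 = 2/3. All spans are invented.
def _demo_span_f1():
    R = {(1, 2, 'LOC')}                 # predicted spans
    T = {(1, 2, 'LOC'), (4, 5, 'PER')}  # gold spans
    X, Y, Z = len(R & T), len(R), len(T)
    f1 = 2 * X / (Y + Z)
    assert (X, Y, Z) == (1, 1, 2) and abs(f1 - 2 / 3) < 1e-9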
class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print(f'[val] f1: {f1:.5f}, p: {precision:.5f}, r: {recall:.5f}, best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
bert/bert4torch_cmcc/examples/sequence_labeling/task_sequence_labeling_ner_global_pointer.py
0 → 100644
#! -*- coding:utf-8 -*-
# global_pointer for named entity recognition
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# blog post: https://kexue.fm/archives/8373
# [valid_f1]: 95.66
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import GlobalPointer
import random
import os
maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key, value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix the random seed
seed_everything(42)
# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        data = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                text, label = '', []
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    text += char
                    if flag[0] == 'B':
                        label.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        label[-1][1] = i
                data.append((text, label))  # label is [[start, end, entity], ...]
        return data
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for i, (text, text_labels) in enumerate(batch):
        tokens = tokenizer.tokenize(text, maxlen=maxlen)
        mapping = tokenizer.rematch(text, tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros((len(categories_label2id), maxlen, maxlen))
        for start, end, label in text_labels:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                label = categories_label2id[label]
                labels[label, start, end] = 1
        batch_token_ids.append(token_ids)  # length was already truncated above
        batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels
# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)
# define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.global_pointer = GlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
        return logit

model = Model().to(device)
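# The only difference from the efficient_global_pointer script is this layer:
# GlobalPointer learns a separate start/end projection per entity type, while
# EfficientGlobalPointer shares one q/k projection and adds a cheap per-type
# bias (https://kexue.fm/archives/8877). A rough parameter count under that
# reading (our estimate, not taken from the library):
def _demo_pointer_param_count(hidden=768, heads=3, head_size=64):
    gp = hidden * heads * 2 * head_size                       # per-type q/k projections
    egp = hidden * 2 * head_size + 2 * head_size * heads * 2  # shared q/k + type bias
    return gp, egp  # ~295K vs ~99K weights for this config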
class MyLoss(MultilabelCategoricalCrossentropy):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_pred, y_true):
        y_true = y_true.view(y_true.shape[0] * y_true.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        y_pred = y_pred.view(y_pred.shape[0] * y_pred.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        return super().forward(y_pred, y_true)

model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))
def evaluate(data, threshold=0):
    X, Y, Z = 0, 1e-10, 1e-10
    for x_true, label in data:
        scores = model.predict(x_true)
        for i, score in enumerate(scores):
            R = set()
            for l, start, end in zip(*np.where(score.cpu() > threshold)):
                R.add((start, end, categories_id2label[l]))
            T = set()
            for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
                T.add((start, end, categories_id2label[l]))
            X += len(R & T)
            Y += len(R)
            Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
class Evaluator(Callback):
    """Evaluate and save the model
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print(f'[val] f1: {f1:.5f}, p: {precision:.5f}, r: {recall:.5f}, best_f1: {self.best_val_f1:.5f}')
if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
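# Hedged usage sketch (not part of the original script): decoding the entities
# in one raw sentence with the trained model, mirroring the np.where idiom from
# evaluate. The sample text is invented.
def _demo_extract_entities(text='我爱北京天安门'):
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = torch.tensor([tokenizer.tokens_to_ids(tokens)], dtype=torch.long, device=device)
    score = model.predict(token_ids)[0].cpu()
    entities = []
    for l, start, end in zip(*np.where(score > 0)):
        char_start, char_end = mapping[start][0], mapping[end][-1]  # token span -> char span
        entities.append((text[char_start:char_end + 1], categories_id2label[l]))
    return entities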