ModelZoo / bert4torch_pytorch / Commits

Commit 66a1d0d0, authored Aug 22, 2023 by yangzhong
Commit message: initial version of the bert4torch project
Pipeline #519: canceled with stages · Changes: 160 · Pipelines: 1

Showing 20 changed files with 3689 additions and 0 deletions (+3689 −0)
Changed files:
examples/seq2seq/task_seq2seq_autotitle_csl_t5_pegasus.py   +164  −0
examples/seq2seq/task_seq2seq_autotitle_csl_uer_t5.py   +161  −0
examples/seq2seq/task_seq2seq_autotitle_csl_unilm.py   +172  −0
examples/seq2seq/task_seq2seq_simbert.py   +253  −0
examples/sequence_labeling/crf.py   +182  −0
examples/sequence_labeling/crf_ddp.py   +198  −0
examples/sequence_labeling/crf_m.py   +201  −0
examples/sequence_labeling/multi_train.sh   +16  −0
examples/sequence_labeling/single_train.sh   +3  −0
examples/sequence_labeling/t.py   +182  −0
examples/sequence_labeling/task_sequence_labeling_ner_cascade_crf.py   +231  −0
examples/sequence_labeling/task_sequence_labeling_ner_crf.py   +181  −0
examples/sequence_labeling/task_sequence_labeling_ner_crf_add_posseg.py   +199  −0
examples/sequence_labeling/task_sequence_labeling_ner_crf_freeze.py   +209  −0
examples/sequence_labeling/task_sequence_labeling_ner_efficient_global_pointer.py   +149  −0
examples/sequence_labeling/task_sequence_labeling_ner_global_pointer.py   +151  −0
examples/sequence_labeling/task_sequence_labeling_ner_mrc.py   +222  −0
examples/sequence_labeling/task_sequence_labeling_ner_span.py   +199  −0
examples/sequence_labeling/task_sequence_labeling_ner_tplinker_plus.py   +152  −0
examples/sequence_labeling/uie/convert.py   +464  −0
examples/seq2seq/task_seq2seq_autotitle_csl_t5_pegasus.py (new file, 0 → 100644)

#! -*- coding: utf-8 -*-
# Fine-tune T5 PEGASUS for a Seq2Seq task; a BertTokenizer is used
# Introduction: https://kexue.fm/archives/8209
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_t5_pegasus.py

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import jieba
jieba.initialize()

# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000

# BERT configuration
pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/'
config_path = pretrain_model + 'config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
dict_path = pretrain_model + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)


class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data
        Single sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D


tokenizer = Tokenizer(dict_path, do_lower_case=True, pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def collate_fn(batch):
    """Single sample format: content: [CLS]article[SEP]  tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)

        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append([0] + token_ids)

    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()


train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='mt5.1.1',
    segment_vocab_size=0,
    attention_scale=False,
    is_dropout=True,
    tie_emb_prj_weight=False,  # untied (independent) output projection weights
).to(device)


class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)


model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))


class AutoTitle(AutoRegressiveDecoder):
    """Seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        # inputs contains [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
        return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])


autotitle = AutoTitle(start_id=0, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)


class Evaluator(Callback):
    """Evaluation and saving
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
                                      smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}


def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))


if __name__ == '__main__':
    evaluator = Evaluator()
    just_show()
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
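The collate_fn above implements teacher forcing: the decoder receives the title ids shifted right by one position while the loss compares the logits against the same ids shifted left, with padding id 0 ignored. A minimal sketch with toy ids (not real vocabulary entries):

# Minimal sketch of the shift used in collate_fn above; toy ids only.
import torch

title_ids = torch.tensor([[0, 101, 57, 89, 102]])   # [decoder_start, CLS, w1, w2, SEP]
decoder_input = title_ids[:, :-1]                    # fed to the decoder
labels = title_ids[:, 1:].flatten()                  # targets for CrossEntropyLoss(ignore_index=0)
print(decoder_input.tolist(), labels.tolist())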
examples/seq2seq/task_seq2seq_autotitle_csl_uer_t5.py (new file, 0 → 100644)

#! -*- coding: utf-8 -*-
# Fine-tune the UER T5 for a Seq2Seq task
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE

from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, seed_everything
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Basic parameters
max_c_len = 256
max_t_len = 32
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000

# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)


class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data
        Single sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D


# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


def collate_fn(batch):
    """Single sample format: content: [CLS]article[SEP]  tgt: [CLS]title[SEP]
    """
    batch_content_ids, batch_titile_ids = [], []
    for title, content in batch:
        token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
        batch_content_ids.append(token_ids)

        token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
        batch_titile_ids.append(token_ids)

    batch_content_ids = torch.tensor(sequence_padding(batch_content_ids), dtype=torch.long, device=device)
    batch_titile_ids = torch.tensor(sequence_padding(batch_titile_ids), dtype=torch.long, device=device)
    return [[batch_content_ids], [batch_titile_ids[:, :-1]]], batch_titile_ids[:, 1:].flatten()


train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='t5.1.0',
    segment_vocab_size=0,
    attention_scale=False,
    is_dropout=True,
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
).to(device)


class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, y_true):
        _, _, y_pred = outputs
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        return super().forward(y_pred, y_true)


model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 1e-4))


class AutoTitle(AutoRegressiveDecoder):
    """Seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last position

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy())


autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=max_t_len, device=device)


class Evaluator(Callback):
    """Evaluation and saving
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
                                      smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}


def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
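load_vocab(simplified=True) above trims the pretrained vocabulary to the tokens actually needed, re-numbers them contiguously, and returns keep_tokens, the original indices used to select the matching embedding rows. An illustrative toy sketch of that idea (not bert4torch internals; the vocabulary and dropped token are made up):

# Toy illustration of vocabulary simplification; token_dict/keep_tokens here are
# hypothetical values, not produced by bert4torch.
full_vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, 'rare1': 4, '中': 5, '国': 6}
kept = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '中', '国']           # assume 'rare1' gets dropped
token_dict = {tok: new_id for new_id, tok in enumerate(kept)}      # simplified vocab with new ids
keep_tokens = [full_vocab[tok] for tok in kept]                    # original ids: [0, 1, 2, 3, 5, 6]
print(token_dict, keep_tokens)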
examples/seq2seq/task_seq2seq_autotitle_csl_unilm.py (new file, 0 → 100644)

#! -*- coding: utf-8 -*-
# BERT for a Seq2Seq task using the UniLM scheme
# Introduction: https://kexue.fm/archives/6933
# Dataset: the CSL dataset from https://github.com/CLUEbenchmark/CLGE
# Adds the evaluation metrics bleu, rouge-1, rouge-2, rouge-l

import json, os
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import sequence_padding, text_segmentate
from bert4torch.snippets import AutoRegressiveDecoder, Callback, ListDataset
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge  # pip install rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Basic parameters
maxlen = 256
batch_size = 16
epochs = 50
steps_per_epoch = None
valid_len = 1000

# BERT configuration
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'


class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data
        Single sample format: (title, content)
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                l = json.loads(l)
                title, content = l['title'], l['abst']
                D.append((title, content))
        return D


# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


def collate_fn(batch):
    """Single sample format: [CLS]article[SEP]title[SEP]
    """
    batch_token_ids, batch_segment_ids = [], []
    for title, content in batch:
        token_ids, segment_ids = tokenizer.encode(content, title, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]


train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_train.json'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataset = MyDataset('F:/Projects/data/corpus/seq2seq/summary/csl_title_public/csl_title_dev.json')

model = build_transformer_model(
    config_path,
    checkpoint_path,
    with_mlm=True,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
).to(device)


class CrossEntropyLoss(nn.CrossEntropyLoss):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, outputs, target):
        '''
        y_pred: [btz, seq_len, hdsz]
        targets: y_true, y_segment
        '''
        _, y_pred = outputs
        y_true, y_mask = target
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids, which happen to mark the part to predict
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return super().forward(y_pred, y_true)


model.compile(loss=CrossEntropyLoss(ignore_index=0), optimizer=optim.Adam(model.parameters(), 2e-5))


class AutoTitle(AutoRegressiveDecoder):
    """Seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        _, y_pred = model.predict([token_ids, segment_ids])
        return y_pred[:, -1, :]

    def generate(self, text, topk=1, topp=0.95):
        max_c_len = maxlen - self.maxlen
        token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
        output_ids = self.beam_search([token_ids, segment_ids], topk=topk)  # beam search decoding
        return tokenizer.decode(output_ids.cpu().numpy())


autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)


class Evaluator(Callback):
    """Evaluation and saving
    """
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        just_show()
        metrics = self.evaluate(valid_dataset.data[:valid_len])  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # model.save_weights('./best_model.pt')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')], hypothesis=pred_title.split(' '),
                                      smoothing_function=self.smooth)
        rouge_1, rouge_2, rouge_l, bleu = rouge_1 / total, rouge_2 / total, rouge_l / total, bleu / total
        return {'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu}


def just_show():
    s1 = u'抽象了一种基于中心的战术应用场景与业务,并将网络编码技术应用于此类场景的实时数据多播业务中。在分析基于中心网络与Many-to-all业务模式特性的基础上,提出了仅在中心节点进行编码操作的传输策略以及相应的贪心算法。分析了网络编码多播策略的理论增益上界,仿真试验表明该贪心算法能够获得与理论相近的性能增益。最后的分析与仿真试验表明,在这种有中心网络的实时数据多播应用中,所提出的多播策略的实时性能要明显优于传统传输策略。'
    s2 = u'普适计算环境中未知移动节点的位置信息是定位服务要解决的关键技术。在普适计算二维空间定位过程中,通过对三角形定位单元区域的误差分析,提出了定位单元布局(LUD)定理。在此基础上,对多个定位单元布局进行了研究,定义了一个新的描述定位单元中定位参考点覆盖效能的物理量——覆盖基,提出了在误差最小情况下定位单元布局的覆盖基定理。仿真实验表明定位单元布局定理能更好地满足对普适终端实时定位的需求,且具有较高的精度和最大覆盖效能。'
    for s in [s1, s2]:
        print(u'生成标题:', autotitle.generate(s))


if __name__ == '__main__':
    just_show()
    evaluator = Evaluator()
    model.fit(train_dataloader, steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[evaluator])
else:
    model.load_weights('./best_model.pt')
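In the UniLM loss above, segment_ids are 0 over the article and 1 over the title, so multiplying the shifted labels by the shifted segment mask zeroes out article positions, which are then dropped via ignore_index=0. A minimal sketch with toy ids:

# Minimal sketch of the masking in CrossEntropyLoss.forward above; toy ids only.
import torch

token_ids   = torch.tensor([[101, 11, 12, 102, 21, 22, 102]])  # [CLS] article [SEP] title [SEP]
segment_ids = torch.tensor([[  0,  0,  0,   0,  1,  1,   1]])
y_true = (token_ids[:, 1:] * segment_ids[:, 1:]).flatten()      # only title tokens survive
print(y_true.tolist())  # [0, 0, 0, 21, 22, 102]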
examples/seq2seq/task_seq2seq_simbert.py (new file, 0 → 100644)

#! -*- coding: utf-8 -*-
# SimBERT pretraining code; it can also be used for fine-tuning, though other approaches such as sentence_bert may work better for that
# Official project: https://github.com/ZhuiyiTechnology/simbert

import json
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, ListDataset, text_segmentate, AutoRegressiveDecoder, Callback, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab

# Basic settings
maxlen = 32
batch_size = 32

# SimBERT weights are loaded here; continue pretraining/fine-tuning on them with your own data
# To pretrain from scratch, you can also load a bert/roberta checkpoint directly
config_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Read the corpus, one JSON object per line
        Example: {"text": "懂英语的来!", "synonyms": ["懂英语的来!!!", "懂英语的来", "一句英语翻译 懂英语的来"]}
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                D.append(json.loads(l))
        return D


def truncate(text):
    """Truncate a sentence
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]


def collate_fn(batch):
    batch_token_ids, batch_segment_ids = [], []
    for d in batch:
        text, synonyms = d['text'], d['synonyms']
        synonyms = [text] + synonyms
        np.random.shuffle(synonyms)
        text, synonym = synonyms[:2]
        text, synonym = truncate(text), truncate(synonym)
        token_ids, segment_ids = tokenizer.encode(text, synonym, maxlen=maxlen * 2)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        token_ids, segment_ids = tokenizer.encode(synonym, text, maxlen=maxlen * 2)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_token_ids, batch_segment_ids]


train_dataloader = DataLoader(MyDataset('../datasets/data_similarity.json'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


# Build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path,
                                            with_pool='linear', with_mlm='linear', application='unilm',
                                            keep_tokens=keep_tokens)
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pool_cls, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pool_cls, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb


model = Model(pool_method='cls').to(device)


class TotalLoss(nn.Module):
    """The loss has two parts: the seq2seq cross entropy and the similarity cross entropy.
    """
    def forward(self, outputs, target):
        seq_logit, sen_emb = outputs
        seq_label, seq_mask = target
        seq2seq_loss = self.compute_loss_of_seq2seq(seq_logit, seq_label, seq_mask)
        similarity_loss = self.compute_loss_of_similarity(sen_emb)
        return {'loss': seq2seq_loss + similarity_loss, 'seq2seq_loss': seq2seq_loss, 'similarity_loss': similarity_loss}

    def compute_loss_of_seq2seq(self, y_pred, y_true, y_mask):
        '''
        y_pred: [btz, seq_len, hdsz]
        y_true: [btz, seq_len]
        y_mask: [btz, seq_len]
        '''
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # marks the part to predict
        y_pred = y_pred[:, :-1, :]  # predicted sequence, shifted by one position
        y_pred = y_pred.reshape(-1, y_pred.shape[-1])
        y_true = (y_true * y_mask).flatten()
        return F.cross_entropy(y_pred, y_true, ignore_index=0)

    def compute_loss_of_similarity(self, y_pred):
        y_true = self.get_labels_of_similarity(y_pred)  # build the labels
        y_pred = F.normalize(y_pred, p=2, dim=-1)  # normalize the sentence vectors
        similarities = torch.matmul(y_pred, y_pred.T)  # similarity matrix
        similarities = similarities - torch.eye(y_pred.shape[0], device=device) * 1e12  # mask the diagonal
        similarities = similarities * 30  # scale
        loss = F.cross_entropy(similarities, y_true)
        return loss

    def get_labels_of_similarity(self, y_pred):
        idxs = torch.arange(0, y_pred.shape[0], device=device)
        idxs_1 = idxs[None, :]
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
        labels = idxs_1.eq(idxs_2).float()
        return labels


model.compile(loss=TotalLoss(), optimizer=optim.Adam(model.parameters(), 1e-5), metrics=['seq2seq_loss', 'similarity_loss'])


class SynonymsGenerator(AutoRegressiveDecoder):
    """Seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]


synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)


def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z


def gen_synonyms(text, n=100, k=20):
    """Purpose: generate n sentences similar to the input, then return the k most similar ones.
    Method: generate with seq2seq, then score the candidates with the encoder and rank by similarity.
    Example:
    >>> gen_synonyms(u'微信和支付宝哪个好?')
    [
        u'微信和支付宝,哪个好?',
        u'微信和支付宝哪个好',
        u'支付宝和微信哪个好',
        u'支付宝和微信哪个好啊',
        u'微信和支付宝那个好用?',
        u'微信和支付宝哪个好用',
        u'支付宝和微信那个更好',
        u'支付宝和微信哪个好用',
        u'微信和支付宝用起来哪个好?',
        u'微信和支付宝选哪个好',
    ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]  # exclude candidates identical to the input
    r = [text] + r
    Z = cal_sen_emb(r)
    Z /= (Z ** 2).sum(dim=1, keepdims=True) ** 0.5
    argsort = torch.matmul(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]


def just_show(some_samples):
    """Show results on a few random samples
    """
    S = [np.random.choice(some_samples) for _ in range(3)]
    for s in S:
        try:
            print(u'原句子:%s' % s)
            print(u'同义句子:', gen_synonyms(s, 10, 10))
            print()
        except:
            pass


class Evaluator(Callback):
    """Evaluate the model
    """
    def __init__(self):
        self.lowest = 1e10

    def on_epoch_end(self, global_step, epoch, logs=None):
        # save the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            # model.save_weights('./best_model.pt')
        # demo
        just_show(['微信和支付宝拿个好用?', '微信和支付宝,哪个好?', '微信和支付宝哪个好', '支付宝和微信哪个好', '支付宝和微信哪个好啊',
                   '微信和支付宝那个好用?', '微信和支付宝哪个好用', '支付宝和微信那个更好', '支付宝和微信哪个好用',
                   '微信和支付宝用起来哪个好?', '微信和支付宝选哪个好'])


if __name__ == '__main__':
    choice = 'similarity'  # train generate similarity
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, epochs=50, steps_per_epoch=200, callbacks=[evaluator])
    elif choice == 'generate':
        print(gen_synonyms('我想去北京玩玩可以吗', 10, 10))
    elif choice == 'similarity':
        target_text = '我想去首都北京玩玩'
        text_list = ['我想去北京玩', '北京有啥好玩的吗?我想去看看', '好渴望去北京游玩啊']
        Z = cal_sen_emb([target_text] + text_list)
        Z /= (Z ** 2).sum(dim=1, keepdims=True) ** 0.5
        similarity = torch.matmul(Z[1:], Z[0])
        for i, line in enumerate(text_list):
            print(f'cos_sim: {similarity[i].item():.4f}, tgt_text: "{target_text}", cal_text: "{line}"')
else:
    model.load_weights('./best_model.pt')
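Because collate_fn above appends each (text, synonym) pair twice (text+synonym and synonym+text), rows 2k and 2k+1 of a batch are mutual positives, and get_labels_of_similarity marks exactly that pairing. A minimal sketch for a batch of four sentence embeddings:

# Minimal sketch of get_labels_of_similarity above; batch size 4 as an example.
import torch

idxs = torch.arange(0, 4)
idxs_1 = idxs[None, :]
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]   # pairs 0<->1 and 2<->3
labels = idxs_1.eq(idxs_2).float()
print(labels)  # row 0 is positive with column 1, row 1 with column 0, and so on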
examples/sequence_labeling/crf.py (new file, 0 → 100644)

#! -*- coding:utf-8 -*-
# BERT+CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90

import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = '/bert4torch/datasets/bert-base-chinese/config.json'
checkpoint_path = '/bert4torch/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/bert4torch/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)


# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D


# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels


# Build the dataloaders
train_dataloader = DataLoader(MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn)


# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # includes start and end
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path


model = Model().to(device)


class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)


model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))  # fp32
# model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), use_amp=True)  # fp16


def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)

        # token level
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()

        # entity level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2


def trans_entity2tuple(scores):
    '''Convert the tag tensor into (sample_id, start, end, entity_type) tuples for metric computation
    '''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids


class Evaluator(Callback):
    """Evaluation and saving
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
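trans_entity2tuple above collapses a decoded tag sequence into (sample_id, start, end, type) tuples so that entity-level precision/recall can be computed by set intersection. A minimal self-contained sketch of what it produces, using a simplified decoding loop and toy tags (not the exact function above):

# Minimal sketch: turn a toy BIO tag sequence into entity tuples; sample_id fixed to 0.
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
tags = [0, 3, 4, 0, 1, 2, 2]  # O B-PER I-PER O B-LOC I-LOC I-LOC
entities, current = set(), None
for j, t in enumerate(tags):
    label = categories[t]
    if label.startswith('B-'):
        current = [0, j, j, label[2:]]
        entities.add(tuple(current))
    elif label.startswith('I-') and current and label[2:] == current[3]:
        entities.discard(tuple(current))  # extend the current entity's end position
        current[2] = j
        entities.add(tuple(current))
    else:
        current = None
print(entities)  # {(0, 1, 2, 'PER'), (0, 4, 6, 'LOC')}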
examples/sequence_labeling/crf_ddp.py (new file, 0 → 100644)

#! -*- coding:utf-8 -*-
# BERT+CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90

import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from bert4torch.models import BaseModelDDP
import os

maxlen = 256
batch_size = 64
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
#config_path = '/datasets/bert-base-chinese/bert_config.json'
config_path = '/bert4torch/datasets/bert-base-chinese/config.json'
checkpoint_path = '/bert4torch/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/bert4torch/datasets/bert-base-chinese/vocab.txt'
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
local_rank = int(os.environ['LOCAL_RANK'])
print("local_rank ", local_rank)
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
torch.distributed.init_process_group(backend='nccl')

# Fix the random seed
seed_everything(42)


# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D


# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels


# Build the dataloaders
#train_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
train_dataset = MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/bert4torch/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn)


# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # includes start and end
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path


model = Model().to(device)
# Wrap the model with DDP for multi-GPU training; master_rank is the local_rank used to print the training progress
model = BaseModelDDP(model, master_rank=0, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=False)


class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.module.crf(*outputs, labels)


model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))  # fp32
# Define the loss and optimizer to use; custom ones are supported
# model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), use_amp=True)  # fp16
#compile(self, loss, optimizer, scheduler=None, max_grad_norm=None, use_amp=False, metrics=None, adversarial_train={'name': ''}):


def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.module.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)

        # token level
        #print("#### scores: ", scores)
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()

        # entity level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2


def trans_entity2tuple(scores):
    '''Convert the tag tensor into (sample_id, start, end, entity_type) tuples for metric computation
    '''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids


class Evaluator(Callback):
    """Evaluation and saving
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/sequence_labeling/crf_m.py (new file, 0 → 100644)

#! -*- coding:utf-8 -*-
# BERT+CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90

import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import os

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
#config_path = '/datasets/bert-base-chinese/bert_config.json'
config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = '/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
local_rank = int(os.environ['LOCAL_RANK'])
print("local_rank ", local_rank)
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)
#device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)

# DDP init
dist.init_process_group("nccl", init_method='env://')


# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D


# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels


# Build the dataloaders
train_dataset = MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn)


# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # includes start and end
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path


model = Model().to(device)


class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.module.crf(*outputs, labels)


model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

# DDP
model = DDP(model, device_ids=[local_rank], output_device=local_rank)


def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.module.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)

        # token level
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()

        # entity level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2


def trans_entity2tuple(scores):
    '''Convert the tag tensor into (sample_id, start, end, entity_type) tuples for metric computation
    '''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids


class Evaluator(Callback):
    """Evaluation and saving
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')


if __name__ == '__main__':
    evaluator = Evaluator()
    #model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
    model.module.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/sequence_labeling/multi_train.sh (new file, 0 → 100644)

NUM=$(($(rocm-smi |sed -n '/DCU/,/===/ p' |wc -l) - 2))
START=0
if [ $# -gt 0 ] ; then   ##DCU Number
    NUM=$1
fi
if [ $# -gt 1 ] ; then   ##The First DCU ID
    START=$2
fi
LAST=$((START+NUM-1))

export HIP_VISIBLE_DEVICES=$(seq -s , ${START} ${LAST})
export HSA_FORCE_FINE_GRAIN_PCIE=1

logfile=bert_base_${NUM}dcu_`date +%Y%m%d%H%M%S`.log
python3 -m torch.distributed.run --nproc_per_node=${NUM} crf_ddp.py 2>&1 | tee $logfile
examples/sequence_labeling/single_train.sh (new file, 0 → 100644)

logfile=bert_base_`date +%Y%m%d%H%M%S`.log
python3 crf.py 2>&1 | tee $logfile
examples/sequence_labeling/t.py
0 → 100644
View file @
66a1d0d0
#! -*- coding:utf-8 -*-
# bert+crf用来做实体识别
# 数据集:http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import
numpy
as
np
import
torch
from
torch.utils.data
import
DataLoader
import
torch.nn
as
nn
import
torch.optim
as
optim
from
bert4torch.snippets
import
sequence_padding
,
Callback
,
ListDataset
,
seed_everything
from
bert4torch.layers
import
CRF
from
bert4torch.tokenizers
import
Tokenizer
from
bert4torch.models
import
build_transformer_model
,
BaseModel
from
tqdm
import
tqdm
maxlen
=
256
batch_size
=
16
categories
=
[
'O'
,
'B-LOC'
,
'I-LOC'
,
'B-PER'
,
'I-PER'
,
'B-ORG'
,
'I-ORG'
]
categories_id2label
=
{
i
:
k
for
i
,
k
in
enumerate
(
categories
)}
categories_label2id
=
{
k
:
i
for
i
,
k
in
enumerate
(
categories
)}
# BERT base
#config_path = '/datasets/bert-base-chinese/bert_config.json'
config_path
=
'/datasets/bert-base-chinese/config.json'
checkpoint_path
=
'/datasets/bert-base-chinese/pytorch_model.bin'
dict_path
=
'/datasets/bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # includes start/end
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token-level metrics
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity-level metrics
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor into (sample id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
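The scripts in this commit only train and report corpus-level metrics. As a rough illustration (not part of the original code), a single-sentence inference pass with the CRF model above could look like the sketch below; it assumes training has finished, `best_model.pt` has been saved, and it reuses the `tokenizer`, `model`, `device`, `maxlen` and `categories_id2label` objects defined above. The helper name is hypothetical.

def ner_predict_text(text):
    # hypothetical helper: decode one raw sentence into (entity text, entity type) pairs
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = torch.tensor([tokenizer.tokens_to_ids(tokens)], dtype=torch.long, device=device)
    best_path = model.predict(token_ids)[0]  # [seq_len]
    entities, cur = [], None
    for idx, tag_id in enumerate(best_path):
        tag = categories_id2label[tag_id.item()]
        if tag.startswith('B-'):
            cur = [idx, idx, tag[2:]]
            entities.append(cur)
        elif tag.startswith('I-') and cur is not None and tag[2:] == cur[2]:
            cur[1] = idx
        else:
            cur = None
    # map token positions back to character spans of the original string via `mapping`
    return [(text[mapping[s][0]:mapping[e][-1] + 1], label) for s, e, label in entities if mapping[s] and mapping[e]]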
examples/sequence_labeling/task_sequence_labeling_ner_cascade_crf.py  (new file, 0 → 100644, commit 66a1d0d0)

#! -*- coding:utf-8 -*-
# bert+crf cascade approach: stage 1 tags BIO spans, stage 2 classifies each span
# reference blog: https://zhuanlan.zhihu.com/p/166496466
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 98.11; entity_level: 96.23
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels, batch_entity_ids, batch_entity_labels = [], [], [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        entity_ids, entity_labels = [], []
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = 1  # mark B
                labels[start + 1:end + 1] = 2  # mark I
                entity_ids.append([start, end])
                entity_labels.append(categories.index(label) + 1)
        if not entity_ids:  # at least one span is needed; pad with zeros if there is none
            entity_ids.append([0, 0])
            entity_labels.append(0)
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
        batch_entity_ids.append(entity_ids)
        batch_entity_labels.append(entity_labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)  # [btz, num_entities, start/end]
    batch_entity_labels = torch.tensor(sequence_padding(batch_entity_labels), dtype=torch.long, device=device)  # [btz, num_entities]
    return [batch_token_ids, batch_entity_ids], [batch_labels, batch_entity_labels]

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.dense1 = nn.Linear(768, len(categories))
        self.dense2 = nn.Linear(768, len(categories) + 1)  # includes a padding class
        self.crf = CRF(len(categories))

    def forward(self, inputs):
        # stage-1 outputs
        token_ids, entity_ids = inputs[0], inputs[1]
        last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.dense1(last_hidden_state)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0)
        # stage-2 outputs
        btz, entity_count, _ = entity_ids.shape
        hidden_size = last_hidden_state.shape[-1]
        entity_ids = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
        entity_states = torch.gather(last_hidden_state, dim=1, index=entity_ids).reshape(btz, entity_count, -1, hidden_size)
        entity_states = torch.mean(entity_states, dim=2)  # average the hidden states of the span start and end
        entity_logit = self.dense2(entity_states)  # [btz, num_entities, num_entity_types]
        return emission_score, attention_mask, entity_logit

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            # stage-1 inference
            last_hidden_state = self.bert([token_ids])  # [btz, seq_len, hdsz]
            emission_score = self.dense1(last_hidden_state)  # [btz, seq_len, tag_size]
            attention_mask = token_ids.gt(0)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
            # stage-2 inference
            batch_entity_ids = []
            for one_samp in best_path:
                entity_ids = []
                for j, item in enumerate(one_samp):
                    if item.item() == 1:  # B
                        entity_ids.append([j, j])
                    elif len(entity_ids) == 0:
                        continue
                    elif (len(entity_ids[-1]) > 0) and (item.item() == 2):  # I
                        entity_ids[-1][-1] = j
                    elif len(entity_ids[-1]) > 0:
                        entity_ids.append([])
                if not entity_ids:  # at least one span is needed; pad with zeros if there is none
                    entity_ids.append([0, 0])
                batch_entity_ids.append([i for i in entity_ids if i])
            batch_entity_ids = torch.tensor(sequence_padding(batch_entity_ids), dtype=torch.long, device=device)  # [btz, num_entities, start/end]
            btz, entity_count, _ = batch_entity_ids.shape
            hidden_size = last_hidden_state.shape[-1]
            gather_index = batch_entity_ids.reshape(btz, -1, 1).repeat(1, 1, hidden_size)
            entity_states = torch.gather(last_hidden_state, dim=1, index=gather_index).reshape(btz, entity_count, -1, hidden_size)
            entity_states = torch.mean(entity_states, dim=2)  # average the hidden states of the span start and end
            entity_logit = self.dense2(entity_states)  # [btz, num_entities, num_entity_types]
            entity_pred = torch.argmax(entity_logit, dim=-1)  # [btz, num_entities]
        # each element is a tuple
        entity_tulpe = trans_entity2tuple(batch_entity_ids, entity_pred)
        return best_path, entity_tulpe

model = Model().to(device)

class Loss(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.loss2 = nn.CrossEntropyLoss(ignore_index=0)

    def forward(self, outputs, labels):
        emission_score, attention_mask, entity_logit = outputs
        seq_labels, entity_labels = labels
        loss1 = model.crf(emission_score, attention_mask, seq_labels)
        loss2 = self.loss2(entity_logit.reshape(-1, entity_logit.shape[-1]), entity_labels.flatten())
        return {'loss': loss1 + loss2, 'loss1': loss1, 'loss2': loss2}

# keys returned by Loss are added to the metrics automatically, so loss1/loss2 are printed even without listing them in metrics
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X1, Y1, Z1 = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for (token_ids, entity_ids), (label, entity_labels) in tqdm(data):
        scores, entity_pred = model.predict(token_ids)  # [btz, seq_len]
        # stage-1 metrics: token level
        attention_mask = label.gt(0)
        X1 += (scores.eq(label) * attention_mask).sum().item()
        Y1 += scores.gt(0).sum().item()
        Z1 += label.gt(0).sum().item()
        # stage-2 metrics: entity level
        entity_true = trans_entity2tuple(entity_ids, entity_labels)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X1 / (Y1 + Z1), X1 / Y1, X1 / Z1
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(entity_ids, entity_labels):
    '''Convert tensors into (sample id, start, end, entity type) tuples for metric computation'''
    entity_true = set()
    for i, one_sample in enumerate(entity_ids):
        for j, item in enumerate(one_sample):
            if item[0].item() * item[1].item() != 0:
                entity_true.add((i, item[0].item(), item[1].item(), entity_labels[i, j].item()))
    return entity_true

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-stage 1] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-stage 2] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
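The only non-obvious tensor trick in the cascade model is how the span representations are pooled: the [start, end] indices are expanded along the hidden dimension so that `torch.gather` can pull out the boundary token vectors, which are then averaged. The following standalone toy example (illustrative only, with made-up shapes, independent of the script above) shows that indexing pattern in isolation:

import torch

btz, seq_len, hdsz = 2, 5, 4
last_hidden_state = torch.randn(btz, seq_len, hdsz)
entity_ids = torch.tensor([[[0, 2]], [[1, 4]]])           # one span per sample: [btz, num_entities, 2]
idx = entity_ids.reshape(btz, -1, 1).repeat(1, 1, hdsz)   # [btz, num_entities*2, hdsz]
boundary = torch.gather(last_hidden_state, dim=1, index=idx)        # boundary token vectors
span_repr = boundary.reshape(btz, 1, 2, hdsz).mean(dim=2)           # mean of start/end -> [btz, num_entities, hdsz]
print(span_repr.shape)  # torch.Size([2, 1, 4])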
examples/sequence_labeling/task_sequence_labeling_ner_crf.py  (new file, 0 → 100644, commit 66a1d0d0)

#! -*- coding:utf-8 -*-
# bert+crf for NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.06; entity_level: 95.90
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # includes start/end
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token-level metrics
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity-level metrics
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor into (sample id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/sequence_labeling/task_sequence_labeling_ner_crf_add_posseg.py  (new file, 0 → 100644, commit 66a1d0d0)

#! -*- coding:utf-8 -*-
# bert+crf for NER, with part-of-speech tags added as an extra embedding
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1] token_level: 97.30; entity_level: 96.09
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
import jieba.posseg as psg
from collections import Counter

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
psg_map = {v: i + 1 for i, v in enumerate(['a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'mg', 'mq', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 'rg', 'rr', 'rz', 's', 't', 'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq', 'x', 'y', 'z', 'zg'])}

def collate_fn(batch):
    batch_token_ids, batch_psg_ids, batch_labels = [], [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)  # char span of the i-th token in the original text
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
        # build the part-of-speech inputs
        seg = [(i, p) for word, p in psg.cut(d[0]) for i in word]
        seg_word, seg_p = zip(*seg)
        psg_ids = np.zeros(len(token_ids))
        for i, j in enumerate(mapping):
            if j:
                start, end = j[0], j[-1]  # first/last char position of the token in the original text
                token_new = (''.join(seg_word[start:end + 1])).lower()
                assert tokens[i] == token_new, f"{tokens[i]} -> {token_new}"
                if start == end:
                    psg_ids[i] = psg_map.get(seg_p[start], 0)  # 0 if not in the map
                else:
                    psg_ids[i] = psg_map.get(Counter(seg_p[start:end + 1]).most_common(1)[0][0], 0)  # take the most frequent tag
        batch_psg_ids.append(psg_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_psg_ids = torch.tensor(sequence_padding(batch_psg_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return [batch_token_ids, batch_psg_ids], batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        layer_add_embs = nn.Embedding(len(psg_map) + 1, 768)
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0, layer_add_embs=layer_add_embs)
        self.fc = nn.Linear(768, len(categories))
        self.crf = CRF(len(categories))

    def forward(self, token_ids, psg_ids):
        sequence_output = self.bert([token_ids, psg_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0)
        return emission_score, attention_mask

    def predict(self, token_ids, psg_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids, psg_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for (token_ids, psg_ids), label in tqdm(data):
        scores = model.predict(token_ids, psg_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token-level metrics
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity-level metrics
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor into (sample id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/sequence_labeling/task_sequence_labeling_ner_crf_freeze.py  (new file, 0 → 100644, commit 66a1d0d0)

#! -*- coding:utf-8 -*-
# bert+crf for NER; tests two schemes: the CRF transition weights generated from the training data are either kept fixed (frozen) or only used as initialization
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# init only: [valid_f1] token_level: 97.35; entity_level: 96.42
# frozen:    [valid_f1] token_level: 96.92; entity_level: 95.42
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 256
batch_size = 16
categories = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in f.split('\n\n'):
            if not l:
                continue
            d = ['']
            for i, c in enumerate(l.split('\n')):
                char, flag = c.split(' ')
                d[0] += char
                if flag[0] == 'B':
                    d.append([i, i, flag[2:]])
                elif flag[0] == 'I':
                    d[-1][1] = i
            D.append(d)
    return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the dataloaders
train_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')
valid_data = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev')
train_dataloader = DataLoader(ListDataset(data=train_data), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(ListDataset(data=valid_data), batch_size=batch_size, collate_fn=collate_fn)

# generate the transition weights from the training data
transition = np.zeros((len(categories), len(categories)))
start_transition = np.zeros(len(categories))
end_transition = np.zeros(len(categories))
for d in tqdm(train_data, desc='Generate init_transitions'):
    tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
    mapping = tokenizer.rematch(d[0], tokens)
    start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
    end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
    token_ids = tokenizer.tokens_to_ids(tokens)
    labels = np.zeros(len(token_ids))
    for start, end, label in d[1:]:
        if start in start_mapping and end in end_mapping:
            start = start_mapping[start]
            end = end_mapping[end]
            labels[start] = categories_label2id['B-' + label]
            labels[start + 1:end + 1] = categories_label2id['I-' + label]
    for i in range(len(labels) - 1):
        transition[int(labels[i]), int(labels[i + 1])] += 1
    start_transition[int(labels[0])] += 1  # transition from start to a tag
    end_transition[int(labels[-1])] += 1  # transition from a tag to end
max_v = np.max([np.max(transition), np.max(start_transition), np.max(end_transition)])
min_v = np.min([np.min(transition), np.min(start_transition), np.min(end_transition)])
transition = (transition - min_v) / (max_v - min_v)
start_transition = (start_transition - min_v) / (max_v - min_v)
end_transition = (end_transition - min_v) / (max_v - min_v)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))
        self.crf = CRF(len(categories), init_transitions=[transition, start_transition, end_transition], freeze=True)  # controls whether the transitions are initialized and whether they are trained

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)
        # token-level metrics
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()
        # entity-level metrics
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert a tag tensor into (sample id, start, end, entity type) tuples for metric computation'''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])
        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            # model.save_weights('best_model.pt')
        print(f'[val-token level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/sequence_labeling/task_sequence_labeling_ner_efficient_global_pointer.py  (new file, 0 → 100644, commit 66a1d0d0)

#! -*- coding:utf-8 -*-
# efficient_global_pointer for NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# blog: https://kexue.fm/archives/8373
# [valid_f1]: 96.55
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import EfficientGlobalPointer

maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key, value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        data = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                text, label = '', []
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    text += char
                    if flag[0] == 'B':
                        label.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        label[-1][1] = i
                data.append((text, label))  # label is [[start, end, entity], ...]
        return data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for i, (text, text_labels) in enumerate(batch):
        tokens = tokenizer.tokenize(text, maxlen=maxlen)
        mapping = tokenizer.rematch(text, tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros((len(categories_label2id), maxlen, maxlen))
        for start, end, label in text_labels:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                label = categories_label2id[label]
                labels[label, start, end] = 1
        batch_token_ids.append(token_ids)  # length already limited above
        batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.global_pointer = EfficientGlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
        return logit

model = Model().to(device)

class MyLoss(MultilabelCategoricalCrossentropy):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_pred, y_true):
        y_true = y_true.view(y_true.shape[0] * y_true.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        y_pred = y_pred.view(y_pred.shape[0] * y_pred.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        return super().forward(y_pred, y_true)

model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data, threshold=0.5):
    X, Y, Z, threshold = 1e-10, 1e-10, 1e-10, 0
    for x_true, label in data:
        scores = model.predict(x_true)
        for i, score in enumerate(scores):
            R = set()
            for l, start, end in zip(*np.where(score.cpu() > threshold)):
                R.add((start, end, categories_id2label[l]))
            T = set()
            for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
                T.add((start, end, categories_id2label[l]))
            X += len(R & T)
            Y += len(R)
            Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/sequence_labeling/task_sequence_labeling_ner_global_pointer.py  (new file, 0 → 100644, commit 66a1d0d0)

#! -*- coding:utf-8 -*-
# global_pointer for NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# blog: https://kexue.fm/archives/8373
# [valid_f1]: 95.66
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import GlobalPointer
import random
import os

maxlen = 256
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key, value in categories_label2id.items())
ner_vocab_size = len(categories_label2id)
ner_head_size = 64

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        data = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                text, label = '', []
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    text += char
                    if flag[0] == 'B':
                        label.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        label[-1][1] = i
                data.append((text, label))  # label is [[start, end, entity], ...]
        return data

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for i, (text, text_labels) in enumerate(batch):
        tokens = tokenizer.tokenize(text, maxlen=maxlen)
        mapping = tokenizer.rematch(text, tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros((len(categories_label2id), maxlen, maxlen))
        for start, end, label in text_labels:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                label = categories_label2id[label]
                labels[label, start, end] = 1
        batch_token_ids.append(token_ids)  # length already limited above
        batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels, seq_dims=3), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.global_pointer = GlobalPointer(hidden_size=768, heads=ner_vocab_size, head_size=ner_head_size)

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        logit = self.global_pointer(sequence_output, token_ids.gt(0).long())
        return logit

model = Model().to(device)

class MyLoss(MultilabelCategoricalCrossentropy):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, y_pred, y_true):
        y_true = y_true.view(y_true.shape[0] * y_true.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        y_pred = y_pred.view(y_pred.shape[0] * y_pred.shape[1], -1)  # [btz*ner_vocab_size, seq_len*seq_len]
        return super().forward(y_pred, y_true)

model.compile(loss=MyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data, threshold=0):
    X, Y, Z = 0, 1e-10, 1e-10
    for x_true, label in data:
        scores = model.predict(x_true)
        for i, score in enumerate(scores):
            R = set()
            for l, start, end in zip(*np.where(score.cpu() > threshold)):
                R.add((start, end, categories_id2label[l]))
            T = set()
            for l, start, end in zip(*np.where(label[i].cpu() > threshold)):
                T.add((start, end, categories_id2label[l]))
            X += len(R & T)
            Y += len(R)
            Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
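The two GlobalPointer scripts only decode spans inside `evaluate`. For a single new sentence, the same thresholding can be applied directly to the score tensor. The sketch below is illustrative only (the helper name is hypothetical) and assumes the `model`, `tokenizer`, `device`, `maxlen` and `categories_id2label` objects from the script above, plus a trained checkpoint:

def gp_predict_text(text, threshold=0):
    # hypothetical helper: decode one raw sentence from GlobalPointer scores
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = torch.tensor([tokenizer.tokens_to_ids(tokens)], dtype=torch.long, device=device)
    score = model.predict(token_ids)[0].cpu()  # [heads, seq_len, seq_len]
    entities = []
    for l, start, end in zip(*np.where(score > threshold)):
        if mapping[start] and mapping[end]:  # skip [CLS]/[SEP]
            entities.append((text[mapping[start][0]:mapping[end][-1] + 1], categories_id2label[l]))
    return entities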
examples/sequence_labeling/task_sequence_labeling_ner_mrc.py  (new file, 0 → 100644, commit 66a1d0d0)

#! -*- coding:utf-8 -*-
# MRC (machine reading comprehension) style NER
# dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]: 95.75
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm
from collections import defaultdict

max_c_len = 224
max_q_len = 32
batch_size = 6  # the effective batch size is batch_size * number of entity types
categories = ['LOC', 'PER', 'ORG']
ent2query = {"LOC": "找出下述句子中的地址名", "PER": "找出下述句子中的人名", "ORG": "找出下述句子中的机构名"}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# fix random seed
seed_everything(42)

# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_start_labels, batch_end_labels = [], [], [], []
    batch_ent_type = []
    for d in batch:
        tokens_b = tokenizer.tokenize(d[0], maxlen=max_c_len)[1:]  # drop [CLS]
        mapping = tokenizer.rematch(d[0], tokens_b)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        # group entities by entity type
        label_dict = defaultdict(list)
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                label_dict[label].append((start, end))
        # iterate over entity types: the query is tokens_a, the context is tokens_b
        # each sample is: [CLS] + tokens_a + [SEP] + tokens_b + [SEP]
        for _type in categories:
            start_ids = [0] * len(tokens_b)
            end_ids = [0] * len(tokens_b)
            text_a = ent2query[_type]
            tokens_a = tokenizer.tokenize(text_a, maxlen=max_q_len)
            for _label in label_dict[_type]:
                start_ids[_label[0]] = 1
                end_ids[_label[1]] = 1
            start_ids = [0] * len(tokens_a) + start_ids
            end_ids = [0] * len(tokens_a) + end_ids
            token_ids = tokenizer.tokens_to_ids(tokens_a) + tokenizer.tokens_to_ids(tokens_b)
            segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)
            assert len(start_ids) == len(end_ids) == len(token_ids) == len(segment_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_start_labels.append(start_ids)
            batch_end_labels.append(end_ids)
            batch_ent_type.append(_type)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_start_labels = torch.tensor(sequence_padding(batch_start_labels), dtype=torch.long, device=device)
    batch_end_labels = torch.tensor(sequence_padding(batch_end_labels), dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], [batch_segment_ids, batch_start_labels, batch_end_labels, batch_ent_type]

# build the dataloaders
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'), batch_size=batch_size, collate_fn=collate_fn)

# define the model on top of bert
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path)
        self.mid_linear = nn.Sequential(nn.Linear(768, 128), nn.ReLU(), nn.Dropout(0.1))
        self.start_fc = nn.Linear(128, 2)
        self.end_fc = nn.Linear(128, 2)

    def forward(self, token_ids, segment_ids):
        sequence_output = self.bert([token_ids, segment_ids])  # [btz, seq_len, hdsz]
        seq_out = self.mid_linear(sequence_output)  # [btz, seq_len, mid_dims]
        start_logits = self.start_fc(seq_out)  # [btz, seq_len, 2]
        end_logits = self.end_fc(seq_out)  # [btz, seq_len, 2]
        return start_logits, end_logits

model = Model().to(device)

class Loss(nn.CrossEntropyLoss):
    def forward(self, outputs, labels):
        start_logits, end_logits = outputs
        mask, start_ids, end_ids = labels[:3]
        start_logits = start_logits.view(-1, 2)
        end_logits = end_logits.view(-1, 2)
        # drop the text_a and padding positions and compute the loss on real tokens only
        active_loss = mask.view(-1) == 1
        active_start_logits = start_logits[active_loss]
        active_end_logits = end_logits[active_loss]
        active_start_labels = start_ids.view(-1)[active_loss]
        active_end_labels = end_ids.view(-1)[active_loss]
        start_loss = super().forward(active_start_logits, active_start_labels)
        end_loss = super().forward(active_end_logits, active_end_labels)
        return start_loss + end_loss

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 0, 1e-10, 1e-10
    for (token_ids, segment_ids), labels in tqdm(data, desc='Evaluation'):
        start_logit, end_logit = model.predict([token_ids, segment_ids])  # [btz, seq_len, 2]
        mask, start_ids, end_ids, ent_type = labels
        # entity-level metrics
        entity_pred = mrc_decode(start_logit, end_logit, ent_type, mask)
        entity_true = mrc_decode(start_ids, end_ids, ent_type)
        X += len(entity_pred.intersection(entity_true))
        Y += len(entity_pred)
        Z += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall

# strict decoding baseline
def mrc_decode(start_preds, end_preds, ent_type, mask=None):
    '''Return the start/end positions of each entity'''
    predict_entities = set()
    if mask is not None:  # for predictions, mask out the query and padding positions
        start_preds = torch.argmax(start_preds, -1) * mask
        end_preds = torch.argmax(end_preds, -1) * mask
    start_preds = start_preds.cpu().numpy()
    end_preds = end_preds.cpu().numpy()
    for bt_i in range(start_preds.shape[0]):
        start_pred = start_preds[bt_i]
        end_pred = end_preds[bt_i]
        # collect the results for each sample
        for i, s_type in enumerate(start_pred):
            if s_type == 0:
                continue
            for j, e_type in enumerate(end_pred[i:]):
                if s_type == e_type:
                    # [sample id, entity start, entity end, entity type]
                    predict_entities.add((bt_i, i, i + j, ent_type[bt_i]))
                    break
    return predict_entities

class Evaluator(Callback):
    """Evaluate on the dev set and keep the best score"""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
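At inference time the MRC formulation needs one forward pass per entity type, pairing that type's natural-language query with the sentence. A rough sketch of that loop (hypothetical helper, not part of the original script; it reuses `tokenizer`, `model`, `device`, `ent2query`, `max_q_len`, `max_c_len` and `mrc_decode` from above):

def mrc_predict_text(text):
    results = set()
    for _type, query in ent2query.items():
        tokens_a = tokenizer.tokenize(query, maxlen=max_q_len)
        tokens_b = tokenizer.tokenize(text, maxlen=max_c_len)[1:]  # drop [CLS], as in collate_fn
        token_ids = tokenizer.tokens_to_ids(tokens_a) + tokenizer.tokens_to_ids(tokens_b)
        segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)
        token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
        segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=device)
        start_logit, end_logit = model.predict([token_ids, segment_ids])
        # segment_ids doubles as the mask: query positions are 0, context positions are 1
        results |= mrc_decode(start_logit, end_logit, [_type], mask=segment_ids)
    return results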
examples/sequence_labeling/task_sequence_labeling_ner_span.py
0 → 100644
View file @
66a1d0d0
#! -*- coding:utf-8 -*-
# span阅读理解方案
# 数据集:http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]: 96.31
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.losses import FocalLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

max_len = 256
batch_size = 16
categories = ['LOC', 'PER', 'ORG']
categories_id2label = {i: k for i, k in enumerate(categories, start=1)}
categories_label2id = {k: i for i, k in enumerate(categories, start=1)}

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 固定seed
seed_everything(42)

# 加载数据集
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d = ['']
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    d[0] += char
                    if flag[0] == 'B':
                        d.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        d[-1][1] = i
                D.append(d)
        return D

# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_start_labels, batch_end_labels = [], [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=max_len)[1:]  # 不保留[CLS]
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        start_ids = [0] * len(tokens)
        end_ids = [0] * len(tokens)
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                start_ids[start] = categories_label2id[label]
                end_ids[end] = categories_label2id[label]
        batch_token_ids.append(token_ids)
        batch_start_labels.append(start_ids)
        batch_end_labels.append(end_ids)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_start_labels = torch.tensor(sequence_padding(batch_start_labels), dtype=torch.long, device=device)
    batch_end_labels = torch.tensor(sequence_padding(batch_end_labels), dtype=torch.long, device=device)
    batch_mask = batch_token_ids.gt(0).long()
    return [batch_token_ids], [batch_mask, batch_start_labels, batch_end_labels]

# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn)

# 定义bert上的模型结构
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.mid_linear = nn.Sequential(
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        self.start_fc = nn.Linear(128, len(categories) + 1)  # 0表示没有
        self.end_fc = nn.Linear(128, len(categories) + 1)

    def forward(self, token_ids):
        sequence_output = self.bert(token_ids)  # [bts, seq_len, hdsz]
        seq_out = self.mid_linear(sequence_output)  # [bts, seq_len, mid_dims]
        start_logits = self.start_fc(seq_out)  # [bts, seq_len, num_tags]
        end_logits = self.end_fc(seq_out)  # [bts, seq_len, num_tags]
        return start_logits, end_logits

model = Model().to(device)

class Loss(nn.CrossEntropyLoss):
    def forward(self, outputs, labels):
        start_logits, end_logits = outputs
        mask, start_ids, end_ids = labels
        start_logits = start_logits.view(-1, len(categories) + 1)
        end_logits = end_logits.view(-1, len(categories) + 1)
        # 去掉padding部分的标签,计算真实loss
        active_loss = mask.view(-1) == 1
        active_start_logits = start_logits[active_loss]
        active_end_logits = end_logits[active_loss]
        active_start_labels = start_ids.view(-1)[active_loss]
        active_end_labels = end_ids.view(-1)[active_loss]
        start_loss = super().forward(active_start_logits, active_start_labels)
        end_loss = super().forward(active_end_logits, active_end_labels)
        return start_loss + end_loss

model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data):
    X, Y, Z = 0, 1e-10, 1e-10
    for token_ids, labels in tqdm(data, desc='Evaluation'):
        start_logit, end_logit = model.predict(token_ids)  # [btz, seq_len, 2]
        mask, start_ids, end_ids = labels
        # entity粒度
        entity_pred = span_decode(start_logit, end_logit, mask)
        entity_true = span_decode(start_ids, end_ids)
        X += len(entity_pred.intersection(entity_true))
        Y += len(entity_pred)
        Z += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall

# 严格解码 baseline
def span_decode(start_preds, end_preds, mask=None):
    '''返回实体的start, end
    '''
    predict_entities = set()
    if mask is not None:  # 把padding部分mask掉
        start_preds = torch.argmax(start_preds, -1) * mask
        end_preds = torch.argmax(end_preds, -1) * mask
    start_preds = start_preds.cpu().numpy()
    end_preds = end_preds.cpu().numpy()
    for bt_i in range(start_preds.shape[0]):
        start_pred = start_preds[bt_i]
        end_pred = end_preds[bt_i]
        # 统计每个样本的结果
        for i, s_type in enumerate(start_pred):
            if s_type == 0:
                continue
            for j, e_type in enumerate(end_pred[i:]):
                if s_type == e_type:
                    # [样本id, 实体起点, 实体终点, 实体类型]
                    predict_entities.add((bt_i, i, i + j, categories_id2label[s_type]))
                    break
    return predict_entities

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/sequence_labeling/task_sequence_labeling_ner_tplinker_plus.py
0 → 100644
View file @
66a1d0d0
#! -*- coding:utf-8 -*-
# tplinker_plus用来做实体识别
# [valid_f1]: 95.71
import numpy as np
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import MultilabelCategoricalCrossentropy
from bert4torch.layers import TplinkerHandshakingKernel

maxlen = 64
batch_size = 16
categories_label2id = {"LOC": 0, "ORG": 1, "PER": 2}
categories_id2label = dict((value, key) for key, value in categories_label2id.items())

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 固定seed
seed_everything(42)

# 加载数据集
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        data = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                text, label = '', []
                for i, c in enumerate(l.split('\n')):
                    char, flag = c.split(' ')
                    text += char
                    if flag[0] == 'B':
                        label.append([i, i, flag[2:]])
                    elif flag[0] == 'I':
                        label[-1][1] = i
                text_list = tokenizer.tokenize(text)[1:-1]  # 不保留首位[CLS]和末位[SEP]
                tokens = [j for i in text_list for j in i][:maxlen]  # 以char为单位
                data.append((tokens, label))  # label为[[start, end, entity], ...]
        return data

def trans_ij2k(seq_len, i, j):
    '''把第i行,第j列转化成上三角flat后的序号
    '''
    if (i > seq_len - 1) or (j > seq_len - 1) or (i > j):
        return 0
    return int(0.5 * (2 * seq_len - i + 1) * i + (j - i))

map_ij2k = {(i, j): trans_ij2k(maxlen, i, j) for i in range(maxlen) for j in range(maxlen) if j >= i}
map_k2ij = {v: k for k, v in map_ij2k.items()}

def tran_ent_rel2id():
    '''获取最后一个分类层的映射关系
    '''
    tag2id = {}
    for p in categories_label2id.keys():
        tag2id[p] = len(tag2id)
    return tag2id

tag2id = tran_ent_rel2id()
id2tag = {v: k for k, v in tag2id.items()}

# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    pair_len = maxlen * (maxlen + 1) // 2
    # batch_head_labels: [btz, pair_len, tag2id_len]
    batch_labels = torch.zeros((len(batch), pair_len, len(tag2id)), dtype=torch.long, device=device)
    batch_token_ids = []
    for i, (tokens, labels) in enumerate(batch):
        batch_token_ids.append(tokenizer.tokens_to_ids(tokens))  # 前面已经限制了长度
        for s_i in labels:
            if s_i[1] >= len(tokens):  # 实体的结尾超过文本长度,则不标记
                continue
            batch_labels[i, map_ij2k[s_i[0], s_i[1]], tag2id[s_i[2]]] = 1
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids, length=maxlen), dtype=torch.long, device=device)
    return [batch_token_ids], batch_labels

# 转换数据集
train_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train'),
                              batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev'),
                              batch_size=batch_size, collate_fn=collate_fn)

# 定义bert上的模型结构
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        self.fc = nn.Linear(768, len(tag2id))
        self.handshaking_kernel = TplinkerHandshakingKernel(768, shaking_type='cln_plus', inner_enc_type='lstm')

    def forward(self, inputs):
        last_hidden_state = self.bert(inputs)  # [btz, seq_len, hdsz]
        shaking_hiddens = self.handshaking_kernel(last_hidden_state)
        output = self.fc(shaking_hiddens)  # [btz, pair_len, tag_size]
        return output

model = Model().to(device)
model.compile(loss=MultilabelCategoricalCrossentropy(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

def evaluate(data, threshold=0):
    X, Y, Z, threshold = 0, 1e-10, 1e-10, 0
    for x_true, label in data:
        scores = model.predict(x_true)  # [btz, pair_len, tag_size]
        for i, score in enumerate(scores):
            R = set()
            for pair_id, tag_id in zip(*np.where(score.cpu().numpy() > threshold)):
                start, end = map_k2ij[pair_id][0], map_k2ij[pair_id][1]
                R.add((start, end, tag_id))
            T = set()
            for pair_id, tag_id in zip(*np.where(label[i].cpu().numpy() > threshold)):
                start, end = map_k2ij[pair_id][0], map_k2ij[pair_id][1]
                T.add((start, end, tag_id))
            X += len(R & T)
            Y += len(R)
            Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall = evaluate(valid_dataloader)
        if f1 > self.best_val_f1:
            self.best_val_f1 = f1
            # model.save_weights('best_model.pt')
        print(f'[val] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f} best_f1: {self.best_val_f1:.5f}')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
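
A small sanity check (illustrative, not in the committed file) for the upper-triangular flattening used above: trans_ij2k maps each pair (i, j) with j >= i to a unique index in [0, maxlen*(maxlen+1)//2), and map_k2ij inverts it, which is what collate_fn and evaluate rely on.

# Illustrative only: verify that the handshaking index round-trips.
pair_len = maxlen * (maxlen + 1) // 2
assert len(map_ij2k) == pair_len                             # one slot per (i, j) pair with j >= i
assert all(map_k2ij[k] == ij for ij, k in map_ij2k.items())  # flat index maps back to the same pair
print(trans_ij2k(maxlen, 0, 0), trans_ij2k(maxlen, 0, maxlen - 1), trans_ij2k(maxlen, 1, 1))
# -> 0, 63, 64 when maxlen == 64: row i starts right after row i-1 ends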
examples/sequence_labeling/uie/convert.py
0 → 100644
View file @
66a1d0d0
import argparse
import collections
import json
import os
import pickle
import torch
import logging
import shutil
from tqdm import tqdm
import time

logger = logging.Logger('log')

def get_path_from_url(url, root_dir, check_exist=True, decompress=True):
    """ Download from given url to root_dir.
    if file or directory specified by url is exists under
    root_dir, return the path directly, otherwise download
    from url and decompress it, return the path.
    Args:
        url (str): download url
        root_dir (str): root dir for downloading, it should be
                        WEIGHTS_HOME or DATASET_HOME
        decompress (bool): decompress zip or tar file. Default is `True`
    Returns:
        str: a local path to save downloaded models & weights & datasets.
    """
    import os.path
    import os
    import tarfile
    import zipfile

    def is_url(path):
        """
        Whether path is URL.
        Args:
            path (string): URL string or not.
        """
        return path.startswith('http://') or path.startswith('https://')

    def _map_path(url, root_dir):
        # parse path after download under root_dir
        fname = os.path.split(url)[-1]
        fpath = fname
        return os.path.join(root_dir, fpath)

    def _get_download(url, fullname):
        import requests
        # using requests.get method
        fname = os.path.basename(fullname)
        try:
            req = requests.get(url, stream=True)
        except Exception as e:  # requests.exceptions.ConnectionError
            logger.info("Downloading {} from {} failed with exception {}".format(fname, url, str(e)))
            return False
        if req.status_code != 200:
            raise RuntimeError("Downloading from {} failed with code "
                               "{}!".format(url, req.status_code))
        # For protecting download interupted, download to
        # tmp_fullname firstly, move tmp_fullname to fullname
        # after download finished
        tmp_fullname = fullname + "_tmp"
        total_size = req.headers.get('content-length')
        with open(tmp_fullname, 'wb') as f:
            if total_size:
                with tqdm(total=(int(total_size) + 1023) // 1024, unit='KB') as pbar:
                    for chunk in req.iter_content(chunk_size=1024):
                        f.write(chunk)
                        pbar.update(1)
            else:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        shutil.move(tmp_fullname, fullname)
        return fullname

    def _download(url, path):
        """
        Download from url, save to path.
        url (str): download url
        path (str): download to given path
        """
        if not os.path.exists(path):
            os.makedirs(path)
        fname = os.path.split(url)[-1]
        fullname = os.path.join(path, fname)
        retry_cnt = 0
        logger.info("Downloading {} from {}".format(fname, url))
        DOWNLOAD_RETRY_LIMIT = 3
        while not os.path.exists(fullname):
            if retry_cnt < DOWNLOAD_RETRY_LIMIT:
                retry_cnt += 1
            else:
                raise RuntimeError("Download from {} failed. "
                                   "Retry limit reached".format(url))
            if not _get_download(url, fullname):
                time.sleep(1)
                continue
        return fullname

    def _uncompress_file_zip(filepath):
        with zipfile.ZipFile(filepath, 'r') as files:
            file_list = files.namelist()
            file_dir = os.path.dirname(filepath)
            if _is_a_single_file(file_list):
                rootpath = file_list[0]
                uncompressed_path = os.path.join(file_dir, rootpath)
                files.extractall(file_dir)
            elif _is_a_single_dir(file_list):
                # `strip(os.sep)` to remove `os.sep` in the tail of path
                rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(os.sep)[-1]
                uncompressed_path = os.path.join(file_dir, rootpath)
                files.extractall(file_dir)
            else:
                rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
                uncompressed_path = os.path.join(file_dir, rootpath)
                if not os.path.exists(uncompressed_path):
                    os.makedirs(uncompressed_path)
                files.extractall(os.path.join(file_dir, rootpath))
            return uncompressed_path

    def _is_a_single_file(file_list):
        if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
            return True
        return False

    def _is_a_single_dir(file_list):
        new_file_list = []
        for file_path in file_list:
            if '/' in file_path:
                file_path = file_path.replace('/', os.sep)
            elif '\\' in file_path:
                file_path = file_path.replace('\\', os.sep)
            new_file_list.append(file_path)
        file_name = new_file_list[0].split(os.sep)[0]
        for i in range(1, len(new_file_list)):
            if file_name != new_file_list[i].split(os.sep)[0]:
                return False
        return True

    def _uncompress_file_tar(filepath, mode="r:*"):
        with tarfile.open(filepath, mode) as files:
            file_list = files.getnames()
            file_dir = os.path.dirname(filepath)
            if _is_a_single_file(file_list):
                rootpath = file_list[0]
                uncompressed_path = os.path.join(file_dir, rootpath)
                files.extractall(file_dir)
            elif _is_a_single_dir(file_list):
                rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(os.sep)[-1]
                uncompressed_path = os.path.join(file_dir, rootpath)
                files.extractall(file_dir)
            else:
                rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
                uncompressed_path = os.path.join(file_dir, rootpath)
                if not os.path.exists(uncompressed_path):
                    os.makedirs(uncompressed_path)
                files.extractall(os.path.join(file_dir, rootpath))
            return uncompressed_path

    def _decompress(fname):
        """
        Decompress for zip and tar file
        """
        logger.info("Decompressing {}...".format(fname))
        # For protecting decompressing interupted,
        # decompress to fpath_tmp directory firstly, if decompress
        # successed, move decompress files to fpath and delete
        # fpath_tmp and remove download compress file.
        if tarfile.is_tarfile(fname):
            uncompressed_path = _uncompress_file_tar(fname)
        elif zipfile.is_zipfile(fname):
            uncompressed_path = _uncompress_file_zip(fname)
        else:
            raise TypeError("Unsupport compress file type {}".format(fname))
        return uncompressed_path

    assert is_url(url), "downloading from {} not a url".format(url)
    fullpath = _map_path(url, root_dir)
    if os.path.exists(fullpath) and check_exist:
        logger.info("Found {}".format(fullpath))
    else:
        fullpath = _download(url, root_dir)
    if decompress and (tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath)):
        fullpath = _decompress(fullpath)
    return fullpath

MODEL_MAP = {
    "uie-base": {
        "resource_file_urls": {
            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_v0.1/model_state.pdparams",
            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
            "vocab_file": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
            "special_tokens_map": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
            "tokenizer_config": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json"
        }
    },
    "uie-medium": {
        "resource_file_urls": {
            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium_v1.0/model_state.pdparams",
            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium/model_config.json",
            "vocab_file": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
            "special_tokens_map": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
            "tokenizer_config": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
        }
    },
    "uie-mini": {
        "resource_file_urls": {
            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini_v1.0/model_state.pdparams",
            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini/model_config.json",
            "vocab_file": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
            "special_tokens_map": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
            "tokenizer_config": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
        }
    },
    "uie-micro": {
        "resource_file_urls": {
            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro_v1.0/model_state.pdparams",
            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro/model_config.json",
            "vocab_file": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
            "special_tokens_map": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
            "tokenizer_config": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
        }
    },
    "uie-nano": {
        "resource_file_urls": {
            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano_v1.0/model_state.pdparams",
            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano/model_config.json",
            "vocab_file": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
            "special_tokens_map": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
            "tokenizer_config": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
        }
    },
    "uie-medical-base": {
        "resource_file_urls": {
            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medical_base_v0.1/model_state.pdparams",
            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
            "vocab_file": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt",
            "special_tokens_map": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json",
            "tokenizer_config": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
        }
    },
    "uie-tiny": {
        "resource_file_urls": {
            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny_v0.1/model_state.pdparams",
            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_config.json",
            "vocab_file": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/vocab.txt",
            "special_tokens_map": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/special_tokens_map.json",
            "tokenizer_config": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/tokenizer_config.json"
        }
    }
}

def build_params_map(attention_num=12):
    """
    build params map from paddle-paddle's ERNIE to transformer's BERT
    :return:
    """
    weight_map = collections.OrderedDict({
        'encoder.embeddings.word_embeddings.weight': "bert.embeddings.word_embeddings.weight",
        'encoder.embeddings.position_embeddings.weight': "bert.embeddings.position_embeddings.weight",
        'encoder.embeddings.token_type_embeddings.weight': "bert.embeddings.token_type_embeddings.weight",
        'encoder.embeddings.task_type_embeddings.weight': "embeddings.task_type_embeddings.weight",  # 这里没有前缀bert,直接映射到bert4torch结构
        'encoder.embeddings.layer_norm.weight': 'bert.embeddings.LayerNorm.weight',
        'encoder.embeddings.layer_norm.bias': 'bert.embeddings.LayerNorm.bias',
    })
    # add attention layers
    for i in range(attention_num):
        weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.query.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.q_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.query.bias'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.key.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.k_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.key.bias'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.weight'] = f'bert.encoder.layer.{i}.attention.self.value.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.v_proj.bias'] = f'bert.encoder.layer.{i}.attention.self.value.bias'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.weight'] = f'bert.encoder.layer.{i}.attention.output.dense.weight'
        weight_map[f'encoder.encoder.layers.{i}.self_attn.out_proj.bias'] = f'bert.encoder.layer.{i}.attention.output.dense.bias'
        weight_map[f'encoder.encoder.layers.{i}.norm1.weight'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.weight'
        weight_map[f'encoder.encoder.layers.{i}.norm1.bias'] = f'bert.encoder.layer.{i}.attention.output.LayerNorm.bias'
        weight_map[f'encoder.encoder.layers.{i}.linear1.weight'] = f'bert.encoder.layer.{i}.intermediate.dense.weight'
        weight_map[f'encoder.encoder.layers.{i}.linear1.bias'] = f'bert.encoder.layer.{i}.intermediate.dense.bias'
        weight_map[f'encoder.encoder.layers.{i}.linear2.weight'] = f'bert.encoder.layer.{i}.output.dense.weight'
        weight_map[f'encoder.encoder.layers.{i}.linear2.bias'] = f'bert.encoder.layer.{i}.output.dense.bias'
        weight_map[f'encoder.encoder.layers.{i}.norm2.weight'] = f'bert.encoder.layer.{i}.output.LayerNorm.weight'
        weight_map[f'encoder.encoder.layers.{i}.norm2.bias'] = f'bert.encoder.layer.{i}.output.LayerNorm.bias'
    # add pooler
    weight_map.update(
        {
            'encoder.pooler.dense.weight': 'bert.pooler.dense.weight',
            'encoder.pooler.dense.bias': 'bert.pooler.dense.bias',
            'linear_start.weight': 'linear_start.weight',
            'linear_start.bias': 'linear_start.bias',
            'linear_end.weight': 'linear_end.weight',
            'linear_end.bias': 'linear_end.bias',
        }
    )
    return weight_map

def extract_and_convert(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info('=' * 20 + 'save config file' + '=' * 20)
    config = json.load(open(os.path.join(input_dir, 'model_config.json'), 'rt', encoding='utf-8'))
    config = config['init_args'][0]
    config["architectures"] = ["UIE"]
    config['layer_norm_eps'] = 1e-12
    del config['init_class']
    if 'sent_type_vocab_size' in config:
        config['type_vocab_size'] = config['sent_type_vocab_size']
    config['intermediate_size'] = 4 * config['hidden_size']
    json.dump(config, open(os.path.join(output_dir, 'config.json'), 'wt', encoding='utf-8'), indent=4)
    logger.info('=' * 20 + 'save vocab file' + '=' * 20)
    with open(os.path.join(input_dir, 'vocab.txt'), 'rt', encoding='utf-8') as f:
        words = f.read().splitlines()
    words_set = set()
    words_duplicate_indices = []
    for i in range(len(words) - 1, -1, -1):
        word = words[i]
        if word in words_set:
            words_duplicate_indices.append(i)
        words_set.add(word)
    for i, idx in enumerate(words_duplicate_indices):
        words[idx] = chr(0x1F6A9 + i)  # Change duplicated word to 🚩 LOL
    with open(os.path.join(output_dir, 'vocab.txt'), 'wt', encoding='utf-8') as f:
        for word in words:
            f.write(word + '\n')
    special_tokens_map = {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
    json.dump(special_tokens_map, open(os.path.join(output_dir, 'special_tokens_map.json'), 'wt', encoding='utf-8'))
    tokenizer_config = {
        "do_lower_case": True,
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
        "tokenizer_class": "BertTokenizer"
    }
    json.dump(tokenizer_config, open(os.path.join(output_dir, 'tokenizer_config.json'), 'wt', encoding='utf-8'))
    logger.info('=' * 20 + 'extract weights' + '=' * 20)
    state_dict = collections.OrderedDict()
    weight_map = build_params_map(attention_num=config['num_hidden_layers'])
    paddle_paddle_params = pickle.load(open(os.path.join(input_dir, 'model_state.pdparams'), 'rb'))
    del paddle_paddle_params['StructuredToParameterName@@']
    for weight_name, weight_value in paddle_paddle_params.items():
        if 'weight' in weight_name:
            if 'encoder.encoder' in weight_name or 'pooler' in weight_name or 'linear' in weight_name:
                weight_value = weight_value.transpose()
        # Fix: embedding error
        if 'word_embeddings.weight' in weight_name:
            weight_value[0, :] = 0
        if weight_name not in weight_map:
            logger.info(f"{'=' * 20} [SKIP] {weight_name} {'=' * 20}")
            continue
        state_dict[weight_map[weight_name]] = torch.FloatTensor(weight_value)
        logger.info(f"{weight_name} -> {weight_map[weight_name]} {weight_value.shape}")
    torch.save(state_dict, os.path.join(output_dir, "pytorch_model.bin"))

def check_model(input_model):
    if not os.path.exists(input_model):
        if input_model not in MODEL_MAP:
            raise ValueError('input_model not exists!')
        resource_file_urls = MODEL_MAP[input_model]['resource_file_urls']
        logger.info("Downloading resource files...")
        for key, val in resource_file_urls.items():
            file_path = os.path.join(input_model, key)
            if not os.path.exists(file_path):
                get_path_from_url(val, input_model)

def do_main():
    check_model(args.input_model)
    extract_and_convert(args.input_model, args.output_model)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_model", default="uie-base", type=str,
                        help="Directory of input paddle model.\nWill auto download model [uie-base/uie-tiny]")
    parser.add_argument("-o", "--output_model", default="uie_base_pytorch", type=str,
                        help="Directory of output pytorch model")
    args = parser.parse_args()
    do_main()
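
Typical usage, as implied by the argparse defaults above: run `python convert.py -i uie-base -o uie_base_pytorch`; when the input directory does not exist locally, the required resource files are downloaded into it first. The same conversion can also be driven from Python (illustrative sketch, not part of the committed file):

# Illustrative only: programmatic equivalent of the default CLI invocation.
check_model('uie-base')                              # downloads the uie-base resource files if absent
extract_and_convert('uie-base', 'uie_base_pytorch')  # writes config.json, vocab.txt and pytorch_model.bin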