wangsen / bert · Commits · 19a23d09

Commit 19a23d09, authored Jun 19, 2024 by wangsen
Initial commit
Pipeline #1247 failed with stages in 0 seconds
Showing 12 changed files with 1569 additions and 0 deletions (+1569 −0)
examples/training_trick/task_sentiment_TemporalEnsembling.py                 +115 −0
examples/training_trick/task_sentiment_UDA.py                                +138 −0
examples/training_trick/task_sentiment_adversarial_training.py               +123 −0
examples/training_trick/task_sentiment_exponential_moving_average.py         +112 −0
examples/training_trick/task_sentiment_exponential_moving_average_warmup.py  +114 −0
examples/training_trick/task_sentiment_mixup.py                              +125 −0
examples/training_trick/task_sentiment_virtual_adversarial_training.py       +128 −0
examples/tutorials/Tutorials.md                                              +320 −0
examples/tutorials/tutorials_custom_fit_progress.py                          +122 −0
examples/tutorials/tutorials_load_transformers_model.py                      +107 −0
examples/tutorials/tutorials_small_tips.py                                   +149 −0
setup.py                                                                      +16 −0
examples/training_trick/task_sentiment_TemporalEnsembling.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Improve model generalization with Temporal Ensembling
# Official project: https://github.com/s-laine/tempens
# Third-party PyTorch implementation: https://github.com/ferretj/temporal-ensembling
# Dataset: sentiment classification dataset
# This example reuses the supervised data as unsupervised data
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything, text_segmentate, get_pool_emb
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import TemporalEnsemblingLoss

maxlen = 256
batch_size = 16
epochs = 10

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text, label in batch:
        token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets; the training dataloader uses shuffle=False
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids):
        hidden_states, pooling = self.bert([token_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

class MyLoss(TemporalEnsemblingLoss):
    def forward(self, y_pred, y_true):
        # The supervised data is reused as unsupervised data; in a real scenario a large amount of unsupervised data can be used instead
        y_pred_sup, y_pred_unsup, y_true_sup = y_pred, y_pred, y_true
        return super().forward(y_pred_sup, y_pred_unsup, y_true_sup, model.epoch, model.bti)

loss = MyLoss(epochs=epochs, max_batch_num=None)
model.compile(loss=loss, optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=['accuracy'])

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/training_trick/task_sentiment_UDA.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Semi-supervised learning with the UDA strategy, using text (sentiment) classification as the example, https://arxiv.org/abs/1904.12848
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.losses import UDALoss
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import random

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])

# Ideally unsupervised data from the task domain should be collected; here all the supervised data is reused as unsupervised data
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]

def collate_fn(batch):
    def add_noise(token_ids, del_ratio=0.3):
        '''Random deletion as a simple example; in practice various noise schemes (insertion, deletion, replacement, ...) can be used
        '''
        n = len(token_ids)
        keep_or_not = np.random.rand(n) > del_ratio
        if sum(keep_or_not) == 0:
            keep_or_not[np.random.choice(n)] = True  # guarantee that at least one word remains
        return list(np.array(token_ids)[keep_or_not])

    # batch_token_ids has three parts: supervised data, in-domain unsupervised data, and the unsupervised data after augmentation
    batch_token_ids, batch_labels = [[], [], []], []
    for text, label in batch:
        token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids[0].append(token_ids)
        batch_labels.append([label])

        # Unsupervised part
        unsup_text = random.choice(unsup_dataset)  # randomly pick an unsupervised sample
        token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
        batch_token_ids[1].append(token_ids)
        batch_token_ids[2].append(token_ids[:1] + add_noise(token_ids[1:-1]) + token_ids[-1:])  # augmented unsupervised data
    batch_token_ids = [j for i in batch_token_ids for j in i]
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids):
        hidden_states, pooling = self.bert([token_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

class Loss(UDALoss):
    def forward(self, y_pred, y_true_sup):
        loss, loss_sup, loss_unsup = super().forward(y_pred, y_true_sup, model.global_step, model.total_steps)
        return {'loss': loss, 'loss_sup': loss_sup, 'loss_unsup': loss_unsup}

# Define the loss and optimizer to use; custom ones are supported
model.compile(
    loss=Loss(tsa_schedule='linear_schedule', start_p=0.8),  # other schedules can be used here; when not None the model must be provided
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['loss_sup', 'loss_unsup']  # keys returned by the Loss are added to metrics automatically; the individual losses are printed even without this metrics line
)

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for token_ids, y_true in data:
            y_pred = model.predict(token_ids[:y_true.size(0)]).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/training_trick/task_sentiment_adversarial_training.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Improve model generalization with adversarial training / gradient penalty, covering fgm, pgd, vat and gradient penalty
# Dataset: sentiment classification dataset
# Adversarial training: https://kexue.fm/archives/7234
# Virtual adversarial training: https://kexue.fm/archives/7466
# Gradient penalty: https://kexue.fm/archives/7234
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, get_pool_emb, seed_everything
from bert4torch.tokenizers import Tokenizer
import sys

maxlen = 256
batch_size = 16

# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text, label in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        batch_token_ids.append(token_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids):
        hidden_states, pooling = self.bert([token_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# Pass the mode as a command-line argument
mode = sys.argv[1]
adversarial_train = {'name': mode}
print(f'Using {mode}'.center(60, '='))

# Debug mode: set the dict directly
# Detailed parameter settings can be found in bert4torch.models / bert4torch.snippets
# adversarial_train = {'name': 'fgm'}  # FGM
# adversarial_train = {'name': 'pgd'}  # PGD
# adversarial_train = {'name': 'gradient_penalty'}  # gradient penalty
# adversarial_train = {'name': 'vat'}  # virtual adversarial training; only supervised data is used in this example

model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=['accuracy'],
              adversarial_train=adversarial_train)

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/training_trick/task_sentiment_exponential_moving_average.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Sentiment classification task with exponential moving average (EMA)
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.optimizers import extend_with_exponential_moving_average
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text, label in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        batch_token_ids.append(token_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids):
        hidden_states, pooling = self.bert([token_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)
ema_schedule = extend_with_exponential_moving_average(model, decay=0.99)

# Define the loss and optimizer to use; custom ones are supported
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    scheduler=ema_schedule,
    metrics=['accuracy']
)

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        ema_schedule.apply_ema_weights()  # use the EMA-averaged weights
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        ema_schedule.restore_raw_weights()  # restore the original model weights
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/training_trick/task_sentiment_exponential_moving_average_warmup.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Sentiment classification task combining two strategies: exponential moving average (EMA) and warmup
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.optimizers import extend_with_exponential_moving_average, get_linear_schedule_with_warmup
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text, label in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        batch_token_ids.append(token_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids):
        hidden_states, pooling = self.bert([token_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
ema_schedule = extend_with_exponential_moving_average(model, decay=0.99)
warmup_scheduler = get_linear_schedule_with_warmup(optimizer, len(train_dataloader), num_training_steps=len(train_dataloader) * 10, last_epoch=-1)

# Define the loss and optimizer to use; custom ones are supported
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    scheduler=[ema_schedule, warmup_scheduler],
    metrics=['accuracy']
)

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        ema_schedule.apply_ema_weights()  # use the EMA-averaged weights
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        ema_schedule.restore_raw_weights()  # restore the original model weights
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/training_trick/task_sentiment_mixup.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Sentiment classification task, loading BERT weights
# Mixup strategy, supporting mixup at the embedding, hidden and encoder levels
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.layers import MixUp
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'train'  # 'train' for training, 'infer' for inference

seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for text, label in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        batch_token_ids.append(token_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, mixup_method='encoder', pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
        self.mixup = MixUp(method=mixup_method)

    def forward(self, token_ids):
        hidden_states, pooling = self.mixup.encode(self.bert, [token_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        y_pred = self.dense(output)
        return y_pred

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            hidden_states, pooling = self.bert([token_ids])
            pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
            output = self.dropout(pooled_output)
            y_pred = self.dense(output)
        return y_pred

model = Model().to(device)

class Loss(nn.Module):
    def forward(self, y_pred, y_true):
        return model.mixup(nn.CrossEntropyLoss(), y_pred, y_true)

# Define the loss and optimizer to use; custom ones are supported
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

if __name__ == '__main__':
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
    else:
        model.load_weights('best_model.pt')
examples/training_trick/task_sentiment_virtual_adversarial_training.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Semi-supervised learning with the virtual adversarial training strategy, using text classification as the example
# Only the supervised part contributes to the supervised loss; the adversarial loss is computed over both supervised and unsupervised data
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import random

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])

# Ideally unsupervised data from the task domain should be collected; here all the supervised data is reused as unsupervised data
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]

def collate_fn(batch):
    # batch_token_ids has two parts: supervised data and unsupervised data
    batch_token_ids, batch_labels = [[], []], []
    for text, label in batch:
        token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
        batch_token_ids[0].append(token_ids)
        batch_labels.append([label])

        # Unsupervised part
        unsup_text = random.choice(unsup_dataset)  # randomly pick an unsupervised sample
        token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
        batch_token_ids[1].append(token_ids)
    batch_token_ids = [j for i in batch_token_ids for j in i]
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return batch_token_ids, batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids):
        hidden_states, pooling = self.bert([token_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids[0].gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

class MyLoss(nn.Module):
    def forward(self, y_pred, y_true_sup):
        y_pred_sup = y_pred[:y_true_sup.shape[0]]  # compute the loss over the supervised part only
        return F.cross_entropy(y_pred_sup, y_true_sup)

# Define the loss and optimizer to use; custom ones are supported
model.compile(
    loss=MyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    adversarial_train={'name': 'vat', 'adv_alpha': 1}  # virtual adversarial training
)

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # Evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for inputs, y_true in data:
            inputs = [inputs[0][:y_true.size(0)]]  # supervised part only
            y_pred = model.predict(inputs).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/tutorials/Tutorials.md (new file, mode 100644)

# bert4torch Tutorial

## 1. Example modeling workflow

```python
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import Callback, Logger, Tensorboard, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset; you can also subclass Dataset yourself
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Read the text files and organize them into the required format"""
        D = []
        return D

def collate_fn(batch):
    '''Turn the batch produced by load_data above into Tensors on the target device.
    Note: the return value is split into features and labels; the features may be packed into a list or tuple.
    '''
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Load the dataset
train_dataloader = DataLoader(MyDataset('file_path'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Define the model structure on top of BERT, taking binary text classification as the example
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(768, 2)

    def forward(self, token_ids, segment_ids):
        # The model built by build_transformer_model only accepts list/tuple inputs, so a single input must be wrapped as [token_ids]
        hidden_states, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# Define the loss and optimizer to use; custom ones are supported
model.compile(
    loss=nn.CrossEntropyLoss(),  # a custom Loss is allowed
    optimizer=optim.Adam(model.parameters(), lr=2e-5),  # a custom optimizer is allowed
    scheduler=None,  # a custom scheduler is allowed
    metrics=['accuracy']  # callbacks/metrics can be customized
)

# Evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total

class Evaluator(Callback):
    """Evaluate and save; here it is only called at the end of each epoch"""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    # Specify the number of epochs, steps_per_epoch (unset or None means it is computed automatically) and gradient accumulation grad_accumulation_steps
    # Use the default Logger and Tensorboard
    model.fit(train_dataloader, epochs=20, steps_per_epoch=100, grad_accumulation_steps=2,
              callbacks=[evaluator, Logger('./test/test.log'), Tensorboard('./test/')])
```

## 2. Main modules

### 1) Data processing

#### a. Simplify the vocabulary and build the tokenizer

```python
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,  # path to the vocab file
    simplified=True,  # filter redundant tokens such as [unused1]
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],  # specify the leading tokens, e.g. [UNK] moves from BERT's default position 103 to 1
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)  # if no simplification is needed, this line alone defines the tokenizer
```

#### b. Handy helper functions

A short usage sketch follows this list.

- `text_segmentate()`: truncates the total length to at most maxlen; accepts several sequences as input and truncates the longest one each time; indices gives the positions of the removed tokens
- `tokenizer.encode()`: converts text into token_ids, adding [CLS] at the start and [SEP] at the end by default; returns token_ids and segment_ids, equivalent to calling `tokenizer.tokenize()` followed by `tokenizer.tokens_to_ids()`
- `tokenizer.decode()`: converts token_ids back into text, removing special tokens such as [CLS], [SEP] and [UNK] by default; equivalent to `tokenizer.ids_to_tokens()` plus some post-processing
- `sequence_padding`: pads sequences to the same length; takes a list whose elements are list, ndarray or tensor, and returns an ndarray or tensor
- `parallel_apply()`: applies func to every element of an iterable using multiple processes or threads
- `get_pool_emb()`: obtains sentence embeddings in several ways, depending on the arguments
- `seed_everything()`: fixes the global seed
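
To make the helpers above concrete, here is a minimal sketch of how they are typically chained (not part of the original tutorial; the sample sentence, slice length and maxlen are arbitrary placeholders):

```python
# Sketch only: chaining the helpers listed above; the sample text and lengths are arbitrary
token_ids, segment_ids = tokenizer.encode(u'这个产品很好用', maxlen=256)  # adds [CLS]/[SEP] and returns ids
text_back = tokenizer.decode(token_ids)                                  # back to text, special tokens removed
batch_token_ids = sequence_padding([token_ids, token_ids[:8]])           # pad a small batch to equal length
seed_everything(42)                                                       # fix the global seed for reproducibility
```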
### 2) Model definition

- Model creation

```python
'''
After calling the model, if with_pool, with_nsp or with_mlm is set, the return values are [hidden_states, pool_emb/nsp_emb, mlm_scores] in that order; otherwise only hidden_states is returned
'''
build_transformer_model(
    config_path=config_path,  # path to the model config file
    checkpoint_path=checkpoint_path,  # path to the model weights; the default None means no pretrained weights are loaded
    model='bert',  # model structure to load; a Model customized from nn.Module can also be passed in here
    application='encoder',  # model application; supports encoder, lm and unilm
    segment_vocab_size=2,  # number of type_token_ids, 2 by default; set to 0 if segment_ids are not passed in
    with_pool=False,  # whether to include the Pool part
    with_nsp=False,  # whether to include the NSP part
    with_mlm=False,  # whether to include the MLM part
    return_model_config=False,  # whether to return the model configuration parameters
    output_all_encoded_layers=False,  # whether to return all hidden_state layers
    layer_add_embs=nn.Embedding(2, 768),  # additional custom embedding input
)
```
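
As a minimal usage sketch of the calling convention described in the comment above (paths and input tensors are placeholders), a model built with `with_pool=True` is called with its inputs wrapped in a list and returns the hidden states plus the pooled embedding:

```python
# Sketch only: with_pool=True returns [hidden_states, pooled_emb]; without it, only hidden_states
bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
hidden_states, pooled_emb = bert([token_ids, segment_ids])  # inputs must always be a list/tuple
```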
- Defining the loss, optimizer, scheduler, metrics, etc.

```python
'''
Define the loss, optimizer and metrics to use; custom ones are supported
'''
def eval(y_pred, y_true):  # for illustration only
    return {'rouge-1': random.random(), 'rouge-2': random.random(), 'rouge-l': random.random(), 'bleu': random.random()}

def f1(y_pred, y_true):  # for illustration only
    return random.random()

model.compile(
    loss=nn.CrossEntropyLoss(),  # a custom Loss is allowed
    optimizer=optim.Adam(model.parameters(), lr=2e-5),  # a custom optimizer is allowed
    scheduler=None,  # a custom scheduler is allowed
    adversarial_train={'name': 'fgm'},  # training trick settings; supports fgm, pgd, gradient_penalty, vat
    metrics=['accuracy', eval, {'f1': f1}]  # fields printed by default, such as loss, need no setting; callbacks can be customized in several ways
)
```

- Custom models

```python
'''
Various modifications on top of BERT, e.g. last2layer_average, token_first_last_average
'''
class Model(BaseModel):  # must inherit from BaseModel
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path, checkpoint_path)

    def forward(self):
        pass
```

- [Custom training loop](https://github.com/Tongjilibo/bert4torch/blob/master/examples/tutorials/tutorials_custom_fit_progress.py)

```python
'''
Custom fit procedure, for when the built-in fit() does not meet your needs
'''
class Model(BaseModel):
    def fit(self, train_dataloader, steps_per_epoch, epochs):
        train_dataloader = cycle(train_dataloader)
        self.train()
        for epoch in range(epochs):
            for bti in range(steps_per_epoch):
                train_X, train_y = next(train_dataloader)
                output = self.forward(*train_X)
                loss = self.criterion(output, train_y)
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
```

- Saving and loading models

```python
'''
prefix: whether to save with the original keys, e.g. the original key of word_embedding is bert.embeddings.word_embeddings.weight.
The default None disables this; for a model customized from BaseModel, set it to the member variable name holding the bert model, or '' when the model is used directly.
This mainly makes the weights easy to load from other training frameworks.
'''
# ==== Plain save and load ====
model.save_weights(save_path, prefix=None)  # save the model weights
model.load_weights(save_path)  # load the model weights

# ======= Resuming from a checkpoint ========
# Save the required parameters in on_epoch_end() or on_batch_end() of a Callback
model.save_weights(save_path, prefix=None)  # save the model weights
model.save_steps_params(save_path)  # save the training progress (current epoch and step), used for resuming
torch.save(optimizer.state_dict(), save_path)  # save the optimizer, used for resuming

# Load the parameters saved by the previous run
model.load_weights(save_path)  # load the model weights
model.load_steps_params(save_path)  # load the training progress, used for resuming
state_dict = torch.load(save_path, map_location='cpu')  # load the optimizer, used for resuming
optimizer.load_state_dict(state_dict)
```

- [Training with a model loaded from transformers](https://github.com/Tongjilibo/bert4torch/blob/master/examples/tutorials/tutorials_load_transformers_model.py)

```python
from transformers import AutoModelForSequenceClassification

class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained("file_path", num_labels=2)

    def forward(self, token_ids, attention_mask, segment_ids):
        output = self.bert(input_ids=token_ids, attention_mask=attention_mask, token_type_ids=segment_ids)
        return output.logits
```

### 3) Model evaluation

```python
'''The callback hooks can run at several points
'''
class Evaluator(Callback):
    """Evaluate and save"""
    def __init__(self):
        self.best_val_acc = 0.

    def on_dataloader_end(self):
        # Can be used to regenerate the dataloader,
        # e.g. when there are several data files and one file is read dynamically to rebuild the dataloader, as in pretraining
        pass

    def on_train_begin(self, logs=None):
        # At the start of training
        pass

    def on_train_end(self, logs=None):
        # At the end of training
        pass

    def on_batch_begin(self, global_step, local_step, logs=None):
        # At the start of a batch
        pass

    def on_batch_end(self, global_step, local_step, logs=None):
        # At the end of a batch
        # Can be used, for example, to log in the background or write tensorboard every N steps
        # Avoid print inside batch_begin and batch_end so the progress bar is not interrupted
        pass

    def on_epoch_begin(self, global_step, epoch, logs=None):
        # At the start of an epoch
        pass

    def on_epoch_end(self, global_step, epoch, logs=None):
        # At the end of an epoch
        val_acc = evaluate(valid_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
```

## 3. Other features

### 1) Single-machine multi-GPU training

#### a. Using DataParallel

```python
'''There are two ways to use DP: in the first, forward only computes the logits; in the second, forward computes the loss directly.
The second is recommended, since it partially alleviates the load imbalance.
'''
from bert4torch.models import BaseModelDP
# =========== Prepare the data and define the model ===========
model = BaseModelDP(model)  # use multiple GPUs in DP mode
model.compile(
    loss=lambda x, _: x.mean(),  # mean of the losses computed on the individual GPUs
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
```

#### b. Using DistributedDataParallel

```python
'''DDP is launched from the command line with torch.distributed.launch
'''
# Command-line arguments must be defined
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)
torch.distributed.init_process_group(backend='nccl')

# =========== Prepare the data and define the model ===========
# Use multiple GPUs in DDP mode; master_rank specifies the local_rank used to print the training progress
model = BaseModelDDP(model, master_rank=0, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)

# Define the loss and optimizer to use; custom ones are supported
model.compile(
    loss=lambda x, _: x,  # pass the loss computed in forward straight through
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
```
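
The script above is then started from the shell, typically with something like `python -m torch.distributed.launch --nproc_per_node=2 your_script.py` (the script name and GPU count here are placeholders; recent PyTorch versions provide `torchrun` as the replacement launcher).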
### 2) Logging

```python
# Record to Tensorboard yourself
from tensorboardX import SummaryWriter

class Evaluator(Callback):
    """Evaluate every N steps and write to tensorboard"""
    def on_batch_end(self, global_step, local_step, logs=None):
        if global_step % 100 == 0:
            writer.add_scalar(f"train/loss", logs['loss'], global_step)
            val_acc = evaluate(valid_dataloader)
            writer.add_scalar(f"valid/acc", val_acc, global_step)

# Use the default file Logger and Tensorboard
model.fit(train_dataloader, epochs=20, steps_per_epoch=100, grad_accumulation_steps=2,
          callbacks=[evaluator, Logger('./test/test.log'), Tensorboard('./test/')])
```

### 3) Printing model parameters

```python
from torchinfo import summary
summary(model, input_data=next(iter(train_dataloader))[0])
```
examples/tutorials/tutorials_custom_fit_progress.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Customizing the fit() training loop
from itertools import cycle
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, ProgbarLogger
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

maxlen = 128
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        _, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

    def fit(self, train_dataloader, steps_per_epoch, epochs=1):
        '''Custom fit procedure: for when the built-in fit() does not meet your needs
        '''
        # Progress bar display; can be left out if not needed
        bar = ProgbarLogger(epochs, steps_per_epoch, ['loss'])
        global_step, epoch, best_val_acc = 0, 0, 0
        train_dataloader = cycle(train_dataloader)
        self.train()
        for epoch in range(epochs):
            bar.on_epoch_begin(epoch=epoch)
            for bti in range(steps_per_epoch):
                bar.on_batch_begin()
                train_X, train_y = next(train_dataloader)
                output = self.forward(*train_X)
                loss = self.criterion(output, train_y)
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                bar.on_batch_end(logs={'loss': loss.item()})  # consistent with the fields given when defining bar above
                global_step += 1
            bar.on_epoch_end()

            # Evaluate
            val_acc = evaluate(valid_dataloader)
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                # model.save_weights('best_model.pt')
            print(f'val_acc: {val_acc:.5f}, best_val_acc: {best_val_acc:.5f}\n')

model = Model().to(device)

# Define the loss and optimizer to use; custom ones are supported
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5))

# Evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total

if __name__ == '__main__':
    model.fit(train_dataloader, epochs=20, steps_per_epoch=100)
examples/tutorials/tutorials_load_transformers_model.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Calling a model from the transformers library
# This script is mainly a demonstration; for real training it is recommended to pick one of the two libraries
# A few scenarios where this might be useful:
# 1) bert4torch's fit procedure makes it easy to use adversarial training, gradient penalty, virtual adversarial training, etc.
# 2) temporarily using the model files from the transformers library directly
# 3) cross-checking the results of the two libraries while writing code
from transformers import AutoModelForSequenceClassification
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

maxlen = 128
batch_size = 16
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained("F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12", num_labels=2)

    def forward(self, token_ids, segment_ids):
        output = self.bert(input_ids=token_ids, token_type_ids=segment_ids)
        return output.logits

model = Model().to(device)

# Define the loss and optimizer to use; custom ones are supported
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
    metrics=['accuracy']
)

# Evaluation function
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        total += len(y_true)
        right += (y_true == y_pred).sum().item()
    return right / total

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = evaluate(valid_dataloader)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=100, grad_accumulation_steps=2, callbacks=[evaluator])
else:
    model.load_weights('best_model.pt')
examples/tutorials/tutorials_small_tips.py (new file, mode 100644)

#! -*- coding:utf-8 -*-
# Demonstrating a few tips, using text classification as the example:
# printing parameters with torchinfo, custom metrics, resuming from a checkpoint, default Logger and Tensorboard
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, Logger, Tensorboard, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torchinfo import summary
import os

maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'train'  # 'train' for training, 'infer' for inference

# Fix the seed
seed_everything(42)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data and split it into sentences of at most maxlen."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D

def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self, pool_method='cls') -> None:
        super().__init__()
        self.pool_method = pool_method
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)

    def forward(self, token_ids, segment_ids):
        hidden_states, pooling = self.bert([token_ids, segment_ids])
        pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)
summary(model, input_data=next(iter(train_dataloader))[0])

def acc(y_pred, y_true):
    y_pred = torch.argmax(y_pred, dim=-1)
    return torch.sum(y_pred.eq(y_true)).item() / y_true.numel()

# Define the loss and optimizer to use; custom ones are supported
optimizer = optim.Adam(model.parameters(), lr=2e-5)

if os.path.exists('last_model.pt'):
    model.load_weights('last_model.pt')  # load the model weights
if os.path.exists('last_steps.pt'):
    model.load_steps_params('last_steps.pt')  # load the training progress, used for resuming
if os.path.exists('last_optimizer.pt'):
    state_dict = torch.load('last_optimizer.pt', map_location='cpu')  # load the optimizer, used for resuming
    optimizer.load_state_dict(state_dict)

model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    metrics={'acc': acc}
)

class Evaluator(Callback):
    """Evaluate and save."""
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        test_acc = self.evaluate(test_dataloader)
        logs['val/acc'] = val_acc
        logs['test/acc'] = test_acc
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            # model.save_weights('best_model.pt')
        print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

        model.save_weights('last_model.pt', prefix=None)  # save the model weights
        model.save_steps_params('last_steps.pt')  # save the training progress (current epoch and step), used for resuming
        torch.save(optimizer.state_dict(), 'last_optimizer.pt')  # save the optimizer, used for resuming

    # Evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        return right / total

def inference(texts):
    '''Single-sample inference
    '''
    for text in texts:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
        segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]
        logit = model.predict([token_ids, segment_ids])
        y_pred = torch.argmax(torch.softmax(logit, dim=-1)).cpu().numpy()
        print(text, ' ----> ', y_pred)

if __name__ == '__main__':
    if choice == 'train':
        evaluator = Evaluator()
        model.fit(train_dataloader, epochs=10, steps_per_epoch=100, callbacks=[evaluator, Logger('test.log'), Tensorboard('./')])
    else:
        model.load_weights('best_model.pt')
        inference(['我今天特别开心', '我今天特别生气'])
setup.py (new file, mode 100644)

#! -*- coding: utf-8 -*-
from setuptools import setup, find_packages

setup(
    name='bert4torch',
    version='0.2.2',
    description='an elegant bert4torch',
    long_description='bert4torch: https://github.com/Tongjilibo/bert4torch',
    license='MIT Licence',
    url='https://github.com/Tongjilibo/bert4torch',
    author='Tongjilibo',
    install_requires=['torch>1.6'],
    packages=find_packages()
)