#! -*- coding:utf-8 -*-
# Improve model generalization with Temporal Ensembling
# Official project: https://github.com/s-laine/tempens
# Third-party pytorch implementation: https://github.com/ferretj/temporal-ensembling
# Dataset: sentiment classification dataset
# In this example the labelled data is also reused as the unlabelled data
# (an illustrative sketch of the consistency term follows the imports below)
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything, text_segmentate, get_pool_emb
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import TemporalEnsemblingLoss
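# --- Illustrative sketch only (not the TemporalEnsemblingLoss implementation): the core idea is a
# consistency term between the current predictions and a bias-corrected EMA of past predictions,
# ramped up over training. The names below (ensemble_pred, w_t, alpha) are hypothetical.
import torch.nn.functional as F

def temporal_consistency_sketch(logits, ensemble_pred, epoch, alpha=0.6, w_t=1.0):
    target = ensemble_pred / (1 - alpha ** (epoch + 1))          # bias-correct the EMA target
    loss = w_t * F.mse_loss(torch.softmax(logits, dim=-1), target)
    new_ensemble = alpha * ensemble_pred + (1 - alpha) * torch.softmax(logits, dim=-1).detach()
    return loss, new_ensemble                                    # total loss = supervised CE + loss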
maxlen = 256
batch_size = 16
epochs = 10
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets; the training dataloader uses shuffle=False so that sample order stays fixed across epochs (temporal ensembling tracks per-sample predictions)
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert= build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class MyLoss(TemporalEnsemblingLoss):
def forward(self, y_pred, y_true):
# Here the supervised data doubles as unsupervised data; in a real setting large amounts of genuinely unlabelled data can be used
y_pred_sup, y_pred_unsup, y_true_sup = y_pred, y_pred, y_true
return super().forward(y_pred_sup, y_pred_unsup, y_true_sup, model.epoch, model.bti)
loss = MyLoss(epochs=epochs, max_batch_num=None)
model.compile(loss=loss, optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=['accuracy'])
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# Define the evaluation function
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Semi-supervised learning with the UDA strategy, using text (sentiment) classification as the example, https://arxiv.org/abs/1904.12848
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.losses import UDALoss
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import random
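# --- Illustrative sketch only (not the UDALoss implementation): UDA combines a supervised
# cross-entropy term with a consistency term that pulls predictions on an augmented unlabelled
# sample towards the (sharpened) predictions on the original sample. All names are hypothetical.
import torch.nn.functional as F

def uda_terms_sketch(logits_sup, y_sup, logits_unsup, logits_aug, temperature=0.4):
    loss_sup = F.cross_entropy(logits_sup, y_sup)
    target = F.softmax(logits_unsup.detach() / temperature, dim=-1)   # sharpened target, no gradient
    loss_unsup = F.kl_div(F.log_softmax(logits_aug, dim=-1), target, reduction='batchmean')
    return loss_sup, loss_unsup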
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])
# Ideally you would collect in-domain unlabelled data; here all of the labelled data is reused as unlabelled data
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]
def collate_fn(batch):
def add_noise(token_ids, del_ratio=0.3):
'''Random deletion is used as a simple example here; in practice a variety of noise schemes (insertion, deletion, substitution, ...) can be used
'''
n = len(token_ids)
keep_or_not = np.random.rand(n) > del_ratio
if sum(keep_or_not) == 0:
keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains
return list(np.array(token_ids)[keep_or_not])
# batch_token_ids has three parts: labelled data, in-domain unlabelled data, and the augmented version of that unlabelled data
batch_token_ids, batch_labels = [[], [], []], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids[0].append(token_ids)
batch_labels.append([label])
# Unsupervised part
unsup_text = random.choice(unsup_dataset) # pick one unlabelled sample at random
token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
batch_token_ids[1].append(token_ids)
batch_token_ids[2].append(token_ids[:1] + add_noise(token_ids[1:-1]) + token_ids[-1:]) # augmented unlabelled sample
batch_token_ids = [j for i in batch_token_ids for j in i]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class Loss(UDALoss):
def forward(self, y_pred, y_true_sup):
loss, loss_sup, loss_unsup = super().forward(y_pred, y_true_sup, model.global_step, model.total_steps)
return {'loss': loss, 'loss_sup': loss_sup, 'loss_unsup': loss_unsup}
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=Loss(tsa_schedule='linear_schedule', start_p=0.8), # other TSA schedules can be swapped in; when it is not None the model has to be provided
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['loss_sup', 'loss_unsup'] # keys returned by the Loss are added to the metrics automatically, so the individual losses would be printed even without listing them here
)
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# Define the evaluation function
def evaluate(self, data):
total, right = 0., 0.
for token_ids, y_true in data:
y_pred = model.predict(token_ids[:y_true.size(0)]).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Improve model generalization via adversarial training / gradient penalty, covering fgm, pgd, vat and gradient penalty
# Dataset: sentiment classification dataset
# Adversarial training: https://kexue.fm/archives/7234
# Virtual adversarial training: https://kexue.fm/archives/7466
# Gradient penalty: https://kexue.fm/archives/7234
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, get_pool_emb, seed_everything
from bert4torch.tokenizers import Tokenizer
import sys
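# --- Illustrative sketch only (roughly what happens when adversarial_train={'name': 'fgm'} is set):
# FGM perturbs the word-embedding weights along the gradient direction, a second forward/backward
# pass is run, and the original weights are then restored. Names are hypothetical.
def fgm_perturb_sketch(embedding_weight, epsilon=1.0):
    grad = embedding_weight.grad
    norm = torch.norm(grad)
    if norm != 0 and not torch.isnan(norm):
        r_adv = epsilon * grad / norm
        embedding_weight.data.add_(r_adv)      # attack: move the weights along the gradient
        return r_adv                           # subtract r_adv afterwards to restore the weights
    return None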
maxlen = 256
batch_size = 16
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# Passing the mode in via a command-line argument
mode = sys.argv[1]
adversarial_train = {'name': mode}
print(f'Using {mode}'.center(60, '='))
# Hard-coded alternative (handy for debugging)
# See bert4torch.models / bert4torch.snippets for the detailed parameter settings
# adversarial_train = {'name': 'fgm'} # FGM
# adversarial_train = {'name': 'pgd'} # PGD
# adversarial_train = {'name': 'gradient_penalty'} # gradient penalty
# adversarial_train = {'name': 'vat'} # virtual adversarial training; shown here only on labelled data
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy'], adversarial_train=adversarial_train)
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# Define the evaluation function
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification with an exponential moving average (EMA) of the weights
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.optimizers import extend_with_exponential_moving_average
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
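# --- Illustrative sketch only (roughly what extend_with_exponential_moving_average maintains):
# each step the shadow weights move towards the live weights,
#   shadow <- decay * shadow + (1 - decay) * param
# and evaluation temporarily swaps the shadow weights in. Names are hypothetical.
def ema_update_sketch(shadow_params, model_params, decay=0.99):
    with torch.no_grad():
        for shadow, param in zip(shadow_params, model_params):
            shadow.mul_(decay).add_(param, alpha=1 - decay)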
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
ema_schedule = extend_with_exponential_moving_average(model, decay=0.99)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
scheduler=ema_schedule,
metrics=['accuracy']
)
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# Define the evaluation function
def evaluate(self, data):
ema_schedule.apply_ema_weights() # evaluate with the EMA (shadow) weights
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
ema_schedule.restore_raw_weights() # restore the original model weights
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification with two tricks: exponential moving average (EMA) and learning-rate warmup
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.optimizers import extend_with_exponential_moving_average, get_linear_schedule_with_warmup
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
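# --- Illustrative sketch only: a simplified description of the linear warmup-then-decay factor
# that get_linear_schedule_with_warmup multiplies the base learning rate by (not the library code).
def linear_warmup_factor_sketch(step, num_warmup_steps, num_training_steps):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)      # ramp up from 0 to 1
    return max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))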
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
ema_schedule = extend_with_exponential_moving_average(model, decay=0.99)
warmup_scheduler = get_linear_schedule_with_warmup(optimizer, len(train_dataloader), num_training_steps=len(train_dataloader)*10, last_epoch=-1)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optimizer,
scheduler=[ema_schedule, warmup_scheduler],
metrics=['accuracy']
)
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# Define the evaluation function
def evaluate(self, data):
ema_schedule.apply_ema_weights() # evaluate with the EMA (shadow) weights
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
ema_schedule.restore_raw_weights() # restore the original model weights
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Sentiment classification with pretrained BERT weights
# Mixup strategy, supporting mixup at the embedding, hidden and encoder level
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.layers import MixUp
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
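# --- Illustrative sketch only (roughly the classic mixup recipe the MixUp layer is based on):
# draw lambda ~ Beta(alpha, alpha), mix each sample with a permuted partner, and mix the loss
# with the same lambda. Names are hypothetical.
import numpy as np

def mixup_batch_sketch(hidden, y_true, alpha=0.4):
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(hidden.size(0), device=hidden.device)
    mixed = lam * hidden + (1 - lam) * hidden[perm]
    return mixed, y_true, y_true[perm], lam   # loss = lam*CE(pred, y) + (1-lam)*CE(pred, y[perm])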
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'train' # 'train' for training, 'infer' for inference
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, mixup_method='encoder', pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
self.mixup = MixUp(method=mixup_method)
def forward(self, token_ids):
hidden_states, pooling = self.mixup.encode(self.bert, [token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
y_pred = self.dense(output)
return y_pred
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
y_pred = self.dense(output)
return y_pred
model = Model().to(device)
class Loss(nn.Module):
def forward(self, y_pred, y_true):
return model.mixup(nn.CrossEntropyLoss(), y_pred, y_true)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=Loss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# Define the evaluation function
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Semi-supervised learning with virtual adversarial training (VAT), using text classification as the example
# The supervised part only contributes the supervised loss; the adversarial loss is computed on the supervised + unsupervised data together
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import random
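# --- Illustrative sketch only (not the bert4torch implementation) of the virtual adversarial
# perturbation: one power-iteration step turns a random direction into the direction that most
# changes the prediction, then a KL consistency term is applied. All names are hypothetical;
# forward_fn is assumed to map input embeddings to logits.
def vat_consistency_sketch(forward_fn, embeddings, logits, xi=1e-6, eps=1.0):
    d = torch.randn_like(embeddings)
    d = xi * d / (d.norm(dim=-1, keepdim=True) + 1e-12)
    d.requires_grad_()
    adv_kl = F.kl_div(F.log_softmax(forward_fn(embeddings + d), dim=-1),
                      F.softmax(logits.detach(), dim=-1), reduction='batchmean')
    grad = torch.autograd.grad(adv_kl, d)[0]
    r_adv = eps * grad / (grad.norm(dim=-1, keepdim=True) + 1e-12)
    return F.kl_div(F.log_softmax(forward_fn(embeddings + r_adv.detach()), dim=-1),
                    F.softmax(logits.detach(), dim=-1), reduction='batchmean')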
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])
# Ideally you would collect in-domain unlabelled data; here all of the labelled data is reused as unlabelled data
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]
def collate_fn(batch):
# batch_token_ids has two parts: labelled data first, then unlabelled data
batch_token_ids, batch_labels = [[], []], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids[0].append(token_ids)
batch_labels.append([label])
# Unsupervised part
unsup_text = random.choice(unsup_dataset) # pick one unlabelled sample at random
token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
batch_token_ids[1].append(token_ids)
batch_token_ids = [j for i in batch_token_ids for j in i]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids[0].gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class MyLoss(nn.Module):
def forward(self, y_pred, y_true_sup):
y_pred_sup = y_pred[:y_true_sup.shape[0]] # only the supervised part contributes to this loss
return F.cross_entropy(y_pred_sup, y_true_sup)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=MyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
adversarial_train = {'name': 'vat', 'adv_alpha': 1} # virtual adversarial training
)
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# Define the evaluation function
def evaluate(self, data):
total, right = 0., 0.
for inputs, y_true in data:
inputs = [inputs[0][:y_true.size(0)]] # evaluate only on the supervised part
y_pred = model.predict(inputs).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
# bert4torch tutorial
## 1. End-to-end modelling example
```python
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import Callback, Logger, Tensorboard, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset; you can also subclass Dataset yourself
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Read the text files and arrange them into the required format
"""
D = []
return D
def collate_fn(batch):
'''Process the batch produced by load_data above into Tensors on the target device.
Note: the return value is split into features and labels; the features can be a list or tuple.
'''
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the dataset
train_dataloader = DataLoader(MyDataset('file_path'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Define the model structure on top of BERT, using binary text classification as the example
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(768, 2)
def forward(self, token_ids, segment_ids):
# The model returned by build_transformer_model only accepts a list/tuple of inputs, so a single input is wrapped as [token_ids]
hidden_states, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=nn.CrossEntropyLoss(), # a custom Loss can be used
optimizer=optim.Adam(model.parameters(), lr=2e-5), # a custom optimizer can be used
scheduler=None, # a custom scheduler can be used
metrics=['accuracy'] # metrics can be customized (callback-style functions are also accepted)
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""Evaluation and checkpointing; as defined here it is only called at the end of each epoch
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
# Specify the number of epochs, steps_per_epoch (omit or set to None to compute it automatically) and gradient accumulation via grad_accumulation_steps
# Use the default Logger and Tensorboard callbacks
model.fit(train_dataloader, epochs=20, steps_per_epoch=100, grad_accumulation_steps=2,
callbacks=[evaluator, Logger('./test/test.log'), Tensorboard('./test/')])
```
## 2. Main modules
### 1) Data processing
#### a. Simplify the vocabulary and build the tokenizer
```python
token_dict, keep_tokens = load_vocab(
dict_path=dict_path, # path of the vocab file
simplified=True, # filter out redundant tokens such as [unused1]
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], # tokens placed first, e.g. [UNK] moves from bert's default position 103 to 1
)
tokenizer = Tokenizer(token_dict, do_lower_case=True) # if no simplification is needed, this single line is enough to define the tokenizer
```
#### b. Handy helper functions
- `text_segmentate()`: truncates the total length to at most maxlen; accepts several sequences as input and trims the longest one each time; indices indicates the positions of the removed tokens
- `tokenizer.encode()`: converts text to token_ids, adding [CLS] at the start and [SEP] at the end by default; returns token_ids and segment_ids, equivalent to calling `tokenizer.tokenize()` followed by `tokenizer.tokens_to_ids()`
- `tokenizer.decode()`: converts token_ids back to text, removing special tokens such as [CLS], [SEP] and [UNK] by default; equivalent to calling `tokenizer.ids_to_tokens()` plus some post-processing
- `sequence_padding()`: pads sequences to the same length; takes a list whose elements are lists, ndarrays or tensors and returns an ndarray or tensor
- `parallel_apply()`: applies func to every element of an iterable using multiple processes or threads
- `get_pool_emb()`: obtains sentence embeddings in several ways depending on the arguments
- `seed_everything()`: fixes the global random seed (a short usage sketch of a few of these helpers follows below)
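The sketch below is illustrative only; it assumes `tokenizer` has been built as in section 1 and that the helpers have been imported from `bert4torch.snippets`:
```python
seed_everything(42)                                                  # fix all random seeds
token_ids, segment_ids = tokenizer.encode('今天天气不错', maxlen=32)  # adds [CLS]/[SEP] by default
text = tokenizer.decode(token_ids)                                   # back to text, special tokens removed
padded = sequence_padding([token_ids, token_ids[:5]])                # pad both sequences to the same length
```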
### 2) Model definition
- Building a model
```python
'''
When the model is called, if with_pool, with_nsp or with_mlm is set, the return value is [hidden_states, pool_emb/nsp_emb, mlm_scores] in that order; otherwise only hidden_states is returned
'''
build_transformer_model(
config_path=config_path, # path of the model config file
checkpoint_path=checkpoint_path, # path of the model weights; the default None means no pretrained weights are loaded
model='bert', # model architecture to load; a Model customized from nn.Module can also be passed in here
application='encoder', # model application; encoder, lm and unilm are supported
segment_vocab_size=2, # number of type_token_ids, 2 by default; set to 0 if no segment_ids are passed in
with_pool=False, # whether to include the Pool part
with_nsp=False, # whether to include the NSP part
with_mlm=False, # whether to include the MLM part
return_model_config=False, # whether to also return the model configuration
output_all_encoded_layers=False, # whether to return the hidden states of all layers
layer_add_embs=nn.Embedding(2, 768), # additional custom embedding inputs
)
```
- Defining the loss, optimizer, scheduler, metrics, etc.
```python
'''
Define the loss, optimizer and metrics; custom choices are supported here
'''
def eval(y_pred, y_true):
# for illustration only
return {'rouge-1': random.random(), 'rouge-2': random.random(), 'rouge-l': random.random(), 'bleu': random.random()}
def f1(y_pred, y_true):
# for illustration only
return random.random()
model.compile(
loss=nn.CrossEntropyLoss(), # a custom Loss can be used
optimizer=optim.Adam(model.parameters(), lr=2e-5), # a custom optimizer can be used
scheduler=None, # a custom scheduler can be used
adversarial_train={'name': 'fgm'}, # training-trick setting; fgm, pgd, gradient_penalty and vat are supported
metrics=['accuracy', eval, {'f1': f1}] # fields printed by default such as loss need not be listed; metric functions can be customized in several ways
)
```
- Customizing the model
```python
'''
Various tweaks on top of bert, e.g. last2layer_average, token_first_last_average
'''
class Model(BaseModel):
# must inherit from BaseModel
def __init__(self):
super().__init__()
self.bert = build_transformer_model(config_path, checkpoint_path)
def forward(self):
pass
```
- [Customizing the training loop](https://github.com/Tongjilibo/bert4torch/blob/master/examples/tutorials/tutorials_custom_fit_progress.py)
```python
'''
A custom fit procedure, for when the built-in fit() does not meet your needs
'''
class Model(BaseModel):
def fit(self, train_dataloader, steps_per_epoch, epochs):
train_dataloader = cycle(train_dataloader)
self.train()
for epoch in range(epochs):
for bti in range(steps_per_epoch):
train_X, train_y = next(train_dataloader)
output = self.forward(*train_X)
loss = self.criterion(output, train_y)
loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()
```
- Saving and loading the model
```python
'''
prefix: whether to save with the original keys, e.g. the original key of word_embedding is bert.embeddings.word_embeddings.weight.
The default None disables this; for a model customized on top of BaseModel, set it to the member-variable name that holds the bert model, or to '' when the bert model is used directly.
This mainly makes the weights easier to load from other training frameworks.
'''
# ==== plain save and load ====
model.save_weights(save_path, prefix=None) # save the model weights
model.load_weights(save_path) # load the model weights
# ==== resuming from a checkpoint ====
# Save whatever you need in on_epoch_end() or on_batch_end() of a Callback
model.save_weights(save_path, prefix=None) # save the model weights
model.save_steps_params(save_path) # save the training-progress parameters (current epoch and step), used when resuming
torch.save(optimizer.state_dict(), save_path) # save the optimizer, used when resuming
# Load the parameters saved by the previous run
model.load_weights(save_path) # load the model weights
model.load_steps_params(save_path) # load the training-progress parameters, used when resuming
state_dict = torch.load(save_path, map_location='cpu') # load the optimizer, used when resuming
optimizer.load_state_dict(state_dict)
```
- [Training with a model loaded from transformers](https://github.com/Tongjilibo/bert4torch/blob/master/examples/tutorials/tutorials_load_transformers_model.py)
```python
from transformers import AutoModelForSequenceClassification
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = AutoModelForSequenceClassification.from_pretrained("file_path", num_labels=2)
def forward(self, token_ids, attention_mask, segment_ids):
output = self.bert(input_ids=token_ids, attention_mask=attention_mask, token_type_ids=segment_ids)
return output.logits
```
### 3) Model evaluation
```python
'''Evaluation can be hooked in at several points
'''
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_dataloader_end(self, logs=None):
# Can be used to rebuild the dataloader,
# e.g. when there are several data files: dynamically read one file and regenerate the dataloader, as in pretraining
pass
def on_train_begin(self, logs=None): # at the start of training
pass
def on_train_end(self, logs=None): # at the end of training
pass
def on_batch_begin(self, global_step, local_step, logs=None): # at the start of a batch
pass
def on_batch_end(self, global_step, local_step, logs=None): # at the end of a batch
# e.g. every N steps, write logs or tensorboard records in the background
# avoid print in batch_begin/batch_end so the progress bar is not broken
pass
def on_epoch_begin(self, global_step, epoch, logs=None): # at the start of an epoch
pass
def on_epoch_end(self, global_step, epoch, logs=None): # at the end of an epoch
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
```
## 3. Other features
### 1) Single-machine multi-GPU training
#### a. Using DataParallel
```python
'''There are two ways to use DP: either forward only computes the logits, or forward computes the loss directly.
The second is recommended, as it partly relieves the load imbalance across GPUs.
'''
from bert4torch.models import BaseModelDP
# =========== prepare the data and define the model ===========
model = BaseModelDP(model) # wrap the model for multi-GPU DP mode
model.compile(
loss=lambda x, _: x.mean(), # average of the losses computed on the individual GPUs
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
```
#### b. Using DistributedDataParallel
```python
'''DDP is launched from the command line via torch.distributed.launch
'''
import argparse
import torch
from bert4torch.models import BaseModelDDP
# Command-line arguments are needed (torch.distributed.launch passes --local_rank)
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)
torch.distributed.init_process_group(backend='nccl')
# =========== prepare the data and define the model ===========
# Wrap the model for multi-GPU DDP mode; master_rank is the local_rank used for printing the training progress
model = BaseModelDDP(model, master_rank=0, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=lambda x, _: x, # pass the loss computed inside forward straight through
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
```
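A typical launch command and the per-process data sharding that usually accompany DDP (a sketch; the script name is a placeholder, and `train_dataset`, `batch_size` and `collate_fn` are assumed to be defined as above):
```python
# launch, e.g.: python -m torch.distributed.launch --nproc_per_node=4 train_script.py
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              sampler=DistributedSampler(train_dataset), collate_fn=collate_fn)
```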
### 2) Logging
```python
# Recording to Tensorboard yourself
from tensorboardX import SummaryWriter
writer = SummaryWriter('./tensorboard')  # create the writer used below
class Evaluator(Callback):
"""Evaluate every N steps and record to tensorboard
"""
def on_batch_end(self, global_step, local_step, logs=None):
if global_step % 100 == 0:
writer.add_scalar(f"train/loss", logs['loss'], global_step)
val_acc = evaluate(valid_dataloader)
writer.add_scalar(f"valid/acc", val_acc, global_step)
# Using the default file Logger and Tensorboard callbacks
model.fit(train_dataloader, epochs=20, steps_per_epoch=100, grad_accumulation_steps=2,
callbacks=[evaluator, Logger('./test/test.log'), Tensorboard('./test/')])
```
### 3) Printing a parameter summary
```python
from torchinfo import summary
summary(model, input_data=next(iter(train_dataloader))[0])
```
#! -*- coding:utf-8 -*-
# A custom fit() training loop
from itertools import cycle
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, ProgbarLogger
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
maxlen = 128
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
def fit(self, train_dataloader, steps_per_epoch, epochs=1):
'''A custom fit procedure, for when the built-in fit() does not meet your needs
'''
# Progress-bar display; can be dropped if not needed
bar = ProgbarLogger(epochs, steps_per_epoch, ['loss'])
global_step, epoch, best_val_acc = 0, 0, 0
train_dataloader = cycle(train_dataloader)
self.train()
for epoch in range(epochs):
bar.on_epoch_begin(epoch=epoch)
for bti in range(steps_per_epoch):
bar.on_batch_begin()
train_X, train_y = next(train_dataloader)
output = self.forward(*train_X)
loss = self.criterion(output, train_y)
loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()
bar.on_batch_end(logs={'loss': loss.item()}) # keys must match those passed when the bar was created above
global_step += 1
bar.on_epoch_end()
# Evaluation
val_acc = evaluate(valid_dataloader)
if val_acc > best_val_acc:
best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {best_val_acc:.5f}\n')
model = Model().to(device)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=100)
#! -*- coding:utf-8 -*-
# Using a model from the transformers library
# This script is mainly a demonstration; for real training it is better to stick to one of the two libraries
# A few scenarios where this may be useful:
# 1) bert4torch's fit loop makes it easy to use adversarial training, gradient penalty, virtual adversarial training, etc.
# 2) temporarily using the model files from the transformers library directly
# 3) cross-checking the results of the two libraries while writing code
from transformers import AutoModelForSequenceClassification
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
maxlen = 128
batch_size = 16
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
class Model(BaseModel):
def __init__(self):
super().__init__()
self.bert = AutoModelForSequenceClassification.from_pretrained("F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12", num_labels=2)
def forward(self, token_ids, segment_ids):
output = self.bert(input_ids=token_ids, token_type_ids=segment_ids)
return output.logits
model = Model().to(device)
# Define the loss and optimizer; custom choices are supported here
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy']
)
# Define the evaluation function
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = evaluate(valid_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=20, steps_per_epoch=100, grad_accumulation_steps=2, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding:utf-8 -*-
# Demonstrates a few tips, using text classification as the example:
# printing parameters with torchinfo, custom metrics, resuming from a checkpoint, and the default Logger and Tensorboard callbacks
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, Logger, Tensorboard, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torchinfo import summary
import os
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'train' # 'train' for training, 'infer' for inference
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""Load the data, splitting each text into sentences no longer than maxlen where possible
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the datasets
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
hidden_states, pooling = self.bert([token_ids, segment_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
summary(model, input_data=next(iter(train_dataloader))[0])
def acc(y_pred, y_true):
y_pred = torch.argmax(y_pred, dim=-1)
return torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
# Define the loss and optimizer; custom choices are supported here
optimizer = optim.Adam(model.parameters(), lr=2e-5)
if os.path.exists('last_model.pt'):
model.load_weights('last_model.pt') # load the model weights
if os.path.exists('last_steps.pt'):
model.load_steps_params('last_steps.pt') # load the training-progress parameters, used when resuming
if os.path.exists('last_optimizer.pt'):
state_dict = torch.load('last_optimizer.pt', map_location='cpu') # load the optimizer state, used when resuming
optimizer.load_state_dict(state_dict)
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optimizer,
metrics={'acc': acc}
)
class Evaluator(Callback):
"""Evaluation and checkpointing
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
logs['val/acc'] = val_acc
logs['test/acc'] = test_acc
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
model.save_weights('last_model.pt', prefix=None) # save the model weights
model.save_steps_params('last_steps.pt') # save the training-progress parameters (current epoch and step), used when resuming
torch.save(optimizer.state_dict(), 'last_optimizer.pt') # save the optimizer state, used when resuming
# Define the evaluation function
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
def inference(texts):
'''Inference on single samples
'''
for text in texts:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]
logit = model.predict([token_ids, segment_ids])
y_pred = torch.argmax(torch.softmax(logit, dim=-1)).cpu().numpy()
print(text, ' ----> ', y_pred)
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=100, callbacks=[evaluator, Logger('test.log'), Tensorboard('./')])
else:
model.load_weights('best_model.pt')
inference(['我今天特别开心', '我今天特别生气'])
#! -*- coding: utf-8 -*-
from setuptools import setup, find_packages
setup(
name='bert4torch',
version='0.2.2',
description='an elegant bert4torch',
long_description='bert4torch: https://github.com/Tongjilibo/bert4torch',
license='MIT Licence',
url='https://github.com/Tongjilibo/bert4torch',
author='Tongjilibo',
install_requires=['torch>1.6'],
packages=find_packages()
)