task_sentence_embedding_model_distillation.py 2.45 KB
Newer Older
wangsen's avatar
wangsen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#! -*- coding:utf-8 -*-
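# Model compression: keep only a subset of the bert-base layers.
# In a preliminary test the evaluation metric dropped from 80% to about 77%; not tested in detail.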

from task_sentence_embedding_sup_CosineMSELoss import model, train_dataloader, Model, device, valid_dataloader, evaluate
from bert4torch.snippets import Callback, get_pool_emb
import torch.optim as optim
import torch.nn as nn
from bert4torch.models import build_transformer_model


# Pre-compute, with the imported `model`, the embedding of every token-id
# entry yielded by `train_dataloader`. These embeddings become the regression
# targets for the smaller model trained below. The `labels` half of each batch
# is not used here.
train_token_ids, train_embeddings = [], []
for token_ids_list, labels in train_dataloader:
    for token_ids in token_ids_list:
        train_token_ids.append(token_ids)
        train_embeddings.append(model.encode(token_ids))
    # Debug aid: uncomment to stop after a handful of embeddings.
    # if len(train_embeddings) >= 20:
    #     break

# Each training item is an (input token ids, target embedding) pair.
new_train_dataloader = list(zip(train_token_ids, train_embeddings))
print('train_embeddings done, start model distillation...')


# Keep only a fixed subset of the encoder layers.
class NewModel(Model):
    """``Model`` variant whose ``bert`` encoder keeps only selected hidden layers.

    Args:
        config_path: Path of the BERT config JSON used to build the encoder.
            Defaults to ``DEFAULT_CONFIG_PATH`` (the path this script
            originally hard-coded).
        keep_hidden_layers: Indices of the hidden layers to keep; forwarded to
            ``build_transformer_model``. Defaults to ``(1, 4, 7)``.
        **kwargs: Forwarded unchanged to ``Model.__init__``.
    """

    DEFAULT_CONFIG_PATH = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'

    def __init__(self, *, config_path=None, keep_hidden_layers=(1, 4, 7), **kwargs):
        super().__init__(**kwargs)
        if config_path is None:
            config_path = self.DEFAULT_CONFIG_PATH
        # Overwrite `self.bert` with an encoder built from the config alone
        # (no checkpoint path is passed here); weights are loaded separately
        # by the caller.
        self.bert = build_transformer_model(
            config_path=config_path,
            with_pool=True,
            segment_vocab_size=0,
            # Pass a fresh list so the tuple default is never shared/mutated.
            keep_hidden_layers=list(keep_hidden_layers),
        )

    def forward(self, token_ids):
        """Return the pooled embedding computed from ``token_ids``.

        The encoder output and a mask of the non-zero token ids are handed to
        ``get_pool_emb`` together with ``self.pool_method``.
        """
        hidden_state, pooler = self.bert([token_ids])
        # Positions whose id is > 0 are marked 1, everything else 0
        # (presumably id 0 is padding -- confirm against the tokenizer).
        attention_mask = token_ids.gt(0).long()
        output = get_pool_emb(hidden_state, pooler, attention_mask, self.pool_method)
        return output

# Instantiate the reduced model on the target device.
new_model = NewModel().to(device)

# Training setup: MSE between the model output and the pre-computed
# embeddings, optimised with Adam.
distill_loss = nn.MSELoss()
distill_optimizer = optim.Adam(new_model.parameters(), lr=2e-5)
new_model.compile(loss=distill_loss, optimizer=distill_optimizer)

# Load a subset of the large model's layers; strict=False is passed so the
# load is non-strict.
new_model.load_weights('best_model.pt', strict=False)

# Score the freshly initialised model on the validation set before training.
val_consine = evaluate(new_model, valid_dataloader)
print('init val_cosine after distillation: ', val_consine)

class Evaluator(Callback):
    """Evaluate ``new_model`` at the end of every epoch and track the best score.

    Attributes are documented inline; nothing is written to disk because the
    checkpoint-saving line is commented out.
    """

    def __init__(self):
        # Run the base-class initialiser so that any state `Callback` sets up
        # also exists on this instance.
        super().__init__()
        # Highest validation score seen so far.
        self.best_val_consine = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        """Evaluate on ``valid_dataloader``, update the best score and print both.

        Args:
            global_step: Step counter supplied by the training loop (unused).
            epoch: Epoch index supplied by the training loop (unused).
            logs: Optional logs supplied by the training loop (unused).
        """
        val_consine = evaluate(new_model, valid_dataloader)
        if val_consine > self.best_val_consine:
            self.best_val_consine = val_consine
            # Checkpoint saving is currently disabled:
            # new_model.save_weights('best_model.pt')
        print(f'val_consine: {val_consine:.5f}, best_val_consine: {self.best_val_consine:.5f}\n')


if __name__ == '__main__':
    # Run as a script: train on the pre-computed (token ids, embedding) pairs,
    # evaluating through the callback at the end of every epoch.
    evaluator = Evaluator()
    fit_kwargs = {
        'epochs': 20,
        'steps_per_epoch': None,
        'callbacks': [evaluator],
    }
    new_model.fit(new_train_dataloader, **fit_kwargs)
else:
    # Imported as a module: load weights from 'best_model.pt' with the
    # default load options.
    # NOTE(review): no strict=False is passed here -- confirm that the file
    # holds a checkpoint matching this reduced model.
    new_model.load_weights('best_model.pt')