Commit 66a1d0d0 authored by yangzhong

Initial commit of the bert4torch project

# Data generation step 1
python finetune_step1_dataprocess.py
# Data generation step 2
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/train.json \
--task_type "ext" \
--splits 1.0 0.0 0.0 \
--save_dir ./data/final_data/ \
--negative_ratio 3
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/dev.json \
--task_type "ext" \
--splits 0.0 1.0 0.0 \
--save_dir ./data/final_data/ \
--negative_ratio 0
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/test.json \
--task_type "ext" \
--splits 0.0 0.0 1.0 \
--save_dir ./data/final_data/ \
--negative_ratio 0
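# Each line of ./data/final_data/{train,dev,test}.txt produced above is a JSON record with
# "content", "prompt" and "result_list" fields; an illustrative (made-up) example:
# {"content": "谷爱凌获得金牌", "result_list": [{"text": "谷爱凌", "start": 0, "end": 3}], "prompt": "人名"}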
# Finetune training
python finetune_step3_train.py
# Data conversion step 1
import os
import re
import json
en2ch = {
'ORG':'机构',
'PER':'人名',
'LOC':'籍贯'
}
def preprocess(input_path, save_path, mode):
if not os.path.exists(save_path):
os.makedirs(save_path)
data_path = os.path.join(save_path, mode + ".json")
result = []
tmp = {}
tmp['id'] = 0
tmp['text'] = ''
tmp['relations'] = []
tmp['entities'] = []
    # ======= First, find each sentence and all of its entities and entity types =======
with open(input_path,'r',encoding='utf-8') as fp:
lines = fp.readlines()
texts = []
entities = []
words = []
entity_tmp = []
entities_tmp = []
entity_label = ''
for line in lines:
line = line.strip().split(" ")
if len(line) == 2:
word = line[0]
label = line[1]
words.append(word)
if "B-" in label:
entity_tmp.append(word)
entity_label = en2ch[label.split("-")[-1]]
elif "I-" in label:
entity_tmp.append(word)
if (label == 'O') and entity_tmp:
if ("".join(entity_tmp), entity_label) not in entities_tmp:
entities_tmp.append(("".join(entity_tmp), entity_label))
entity_tmp, entity_label = [], ''
else:
if entity_tmp and (("".join(entity_tmp), entity_label) not in entities_tmp):
entities_tmp.append(("".join(entity_tmp), entity_label))
entity_tmp, entity_label = [], ''
texts.append("".join(words))
entities.append(entities_tmp)
words = []
entities_tmp = []
# ==========================================
    # ======= Find the positions of the entities in each sentence =======
i = 0
for text,entity in zip(texts, entities):
if entity:
ltmp = []
            for ent, ent_type in entity:
                for span in re.finditer(re.escape(ent), text):
                    start = span.start()
                    end = span.end()
                    ltmp.append((ent_type, start, end, ent))
# print(ltmp)
ltmp = sorted(ltmp, key=lambda x:(x[1],x[2]))
for j in range(len(ltmp)):
# tmp['entities'].append(["".format(str(j)), ltmp[j][0], ltmp[j][1], ltmp[j][2], ltmp[j][3]])
tmp['entities'].append({"id":j, "start_offset":ltmp[j][1], "end_offset":ltmp[j][2], "label":ltmp[j][0]})
else:
tmp['entities'] = []
tmp['id'] = i
tmp['text'] = text
result.append(tmp)
tmp = {}
tmp['id'] = 0
tmp['text'] = ''
tmp['relations'] = []
tmp['entities'] = []
i += 1
with open(data_path, 'w', encoding='utf-8') as fp:
fp.write("\n".join([json.dumps(i, ensure_ascii=False) for i in result]))
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train", './data/mid_data', "train")
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev", './data/mid_data', "dev")
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.test", './data/mid_data', "test")
# Data generation step 2
import os
import time
import argparse
import json
from decimal import Decimal
import numpy as np
from bert4torch.snippets import seed_everything
from utils import convert_ext_examples, convert_cls_examples, logger
def do_convert():
seed_everything(args.seed)
tic_time = time.time()
if not os.path.exists(args.doccano_file):
raise ValueError("Please input the correct path of doccano file.")
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
if len(args.splits) != 0 and len(args.splits) != 3:
raise ValueError("Only []/ len(splits)==3 accepted for splits.")
def _check_sum(splits):
return Decimal(str(splits[0])) + Decimal(str(splits[1])) + Decimal(
str(splits[2])) == Decimal("1")
if len(args.splits) == 3 and not _check_sum(args.splits):
raise ValueError(
"Please set correct splits, sum of elements in splits should be equal to 1."
)
with open(args.doccano_file, "r", encoding="utf-8") as f:
raw_examples = f.readlines()
def _create_ext_examples(examples,
negative_ratio=0,
shuffle=False,
is_train=True):
entities, relations = convert_ext_examples(
examples, negative_ratio, is_train=is_train)
examples = entities + relations
if shuffle:
indexes = np.random.permutation(len(examples))
examples = [examples[i] for i in indexes]
return examples
def _create_cls_examples(examples, prompt_prefix, options, shuffle=False):
examples = convert_cls_examples(examples, prompt_prefix, options)
if shuffle:
indexes = np.random.permutation(len(examples))
examples = [examples[i] for i in indexes]
return examples
def _save_examples(save_dir, file_name, examples):
count = 0
save_path = os.path.join(save_dir, file_name)
if not examples:
logger.info("Skip saving %d examples to %s." % (0, save_path))
return
with open(save_path, "w", encoding="utf-8") as f:
for example in examples:
f.write(json.dumps(example, ensure_ascii=False) + "\n")
count += 1
logger.info("Save %d examples to %s." % (count, save_path))
if len(args.splits) == 0:
if args.task_type == "ext":
examples = _create_ext_examples(raw_examples, args.negative_ratio,
args.is_shuffle)
else:
examples = _create_cls_examples(raw_examples, args.prompt_prefix,
args.options, args.is_shuffle)
_save_examples(args.save_dir, "train.txt", examples)
else:
if args.is_shuffle:
indexes = np.random.permutation(len(raw_examples))
raw_examples = [raw_examples[i] for i in indexes]
i1, i2, _ = args.splits
p1 = int(len(raw_examples) * i1)
p2 = int(len(raw_examples) * (i1 + i2))
if args.task_type == "ext":
train_examples = _create_ext_examples(
raw_examples[:p1], args.negative_ratio, args.is_shuffle)
dev_examples = _create_ext_examples(
raw_examples[p1:p2], -1, is_train=False)
test_examples = _create_ext_examples(
raw_examples[p2:], -1, is_train=False)
else:
train_examples = _create_cls_examples(
raw_examples[:p1], args.prompt_prefix, args.options)
dev_examples = _create_cls_examples(
raw_examples[p1:p2], args.prompt_prefix, args.options)
test_examples = _create_cls_examples(
raw_examples[p2:], args.prompt_prefix, args.options)
_save_examples(args.save_dir, "train.txt", train_examples)
_save_examples(args.save_dir, "dev.txt", dev_examples)
_save_examples(args.save_dir, "test.txt", test_examples)
logger.info('Finished! It takes %.2f seconds' % (time.time() - tic_time))
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--doccano_file", default="./data/doccano.json",
type=str, help="The doccano file exported from doccano platform.")
parser.add_argument("-s", "--save_dir", default="./data",
type=str, help="The path of data that you wanna save.")
parser.add_argument("--negative_ratio", default=5, type=int,
help="Used only for the extraction task, the ratio of positive and negative samples, number of negtive samples = negative_ratio * number of positive samples")
parser.add_argument("--splits", default=[0.8, 0.1, 0.1], type=float, nargs="*",
help="The ratio of samples in datasets. [0.6, 0.2, 0.2] means 60%% samples used for training, 20%% for evaluation and 20%% for test.")
parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str,
help="Select task type, ext for the extraction task and cls for the classification task, defaults to ext.")
parser.add_argument("--options", default=["正向", "负向"], type=str, nargs="+",
help="Used only for the classification task, the options for classification")
parser.add_argument("--prompt_prefix", default="情感倾向", type=str,
help="Used only for the classification task, the prompt prefix for classification")
parser.add_argument("--is_shuffle", default=True, type=bool,
help="Whether to shuffle the labeled dataset, defaults to True.")
parser.add_argument("--seed", type=int, default=1000,
help="random seed for initialization")
args = parser.parse_args()
do_convert()
import torch
from torch.utils.data import DataLoader
from model import uie_model, tokenizer
from bert4torch.snippets import seed_everything, sequence_padding, Callback
from torch import nn
from torch.utils.data import Dataset
import numpy as np
import json
from utils import get_bool_ids_greater_than, get_span
from random import sample
batch_size = 16
learning_rate = 1e-5
train_path = 'E:/Github/bert4torch/examples/sequence_labeling/uie/data/final_data/train.txt'
dev_path = 'E:/Github/bert4torch/examples/sequence_labeling/uie/data/final_data/dev.txt'
save_dir = './'
max_seq_len = 256
num_epochs = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
uie_model.to(device)
class IEDataset(Dataset):
"""信息抽取
"""
def __init__(self, file_path, tokenizer, max_seq_len, fewshot=None) -> None:
super().__init__()
self.file_path = file_path
if fewshot is None:
self.dataset = list(self.reader(file_path))
else:
assert isinstance(fewshot, int)
self.dataset = sample(list(self.reader(file_path)), fewshot)
self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
def __len__(self):
return len(self.dataset)
def __getitem__(self, index):
return self.dataset[index]
@staticmethod
def reader(data_path, max_seq_len=512):
"""read json
"""
with open(data_path, 'r', encoding='utf-8') as f:
for line in f:
json_line = json.loads(line)
content = json_line['content']
prompt = json_line['prompt']
                # Model input looks like: [CLS] Prompt [SEP] Content [SEP]
                # It includes three special tokens.
if max_seq_len <= len(prompt) + 3:
raise ValueError("The value of max_seq_len is too small, please set a larger value")
max_content_len = max_seq_len - len(prompt) - 3
if len(content) <= max_content_len:
yield json_line
else:
result_list = json_line['result_list']
json_lines = []
accumulate = 0
while True:
cur_result_list = []
for result in result_list:
if result['start'] + 1 <= max_content_len < result['end']:
max_content_len = result['start']
break
cur_content = content[:max_content_len]
res_content = content[max_content_len:]
while True:
if len(result_list) == 0:
break
elif result_list[0]['end'] <= max_content_len:
if result_list[0]['end'] > 0:
cur_result = result_list.pop(0)
cur_result_list.append(cur_result)
else:
cur_result_list = [result for result in result_list]
break
else:
break
json_line = {'content': cur_content, 'result_list': cur_result_list, 'prompt': prompt}
json_lines.append(json_line)
for result in result_list:
if result['end'] <= 0:
break
result['start'] -= max_content_len
result['end'] -= max_content_len
accumulate += max_content_len
max_content_len = max_seq_len - len(prompt) - 3
if len(res_content) == 0:
break
elif len(res_content) < max_content_len:
json_line = {'content': res_content, 'result_list': result_list, 'prompt': prompt}
json_lines.append(json_line)
break
else:
content = res_content
for json_line in json_lines:
yield json_line
def collate_fn(batch):
"""example: {title, prompt, content, result_list}
"""
batch_token_ids, batch_token_type_ids, batch_start_ids, batch_end_ids = [], [], [], []
for example in batch:
token_ids, token_type_ids, offset_mapping = tokenizer.encode(example["prompt"], example["content"],
maxlen=max_seq_len, return_offsets='transformers')
bias = 0
for index in range(len(offset_mapping)):
if index == 0:
continue
mapping = offset_mapping[index]
if mapping[0] == 0 and mapping[1] == 0 and bias == 0:
bias = index
if mapping[0] == 0 and mapping[1] == 0:
continue
offset_mapping[index][0] += bias
offset_mapping[index][1] += bias
start_ids = [0 for _ in range(len(token_ids))]
end_ids = [0 for _ in range(len(token_ids))]
for item in example["result_list"]:
start = map_offset(item["start"] + bias, offset_mapping)
end = map_offset(item["end"] - 1 + bias, offset_mapping)
start_ids[start] = 1.0
end_ids[end] = 1.0
batch_token_ids.append(token_ids)
batch_token_type_ids.append(token_type_ids)
batch_start_ids.append(start_ids)
batch_end_ids.append(end_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_token_type_ids = torch.tensor(sequence_padding(batch_token_type_ids), dtype=torch.long, device=device)
batch_start_ids = torch.tensor(sequence_padding(batch_start_ids), dtype=torch.float, device=device)
batch_end_ids = torch.tensor(sequence_padding(batch_end_ids), dtype=torch.float, device=device)
return [batch_token_ids, batch_token_type_ids], [batch_start_ids, batch_end_ids]
def map_offset(ori_offset, offset_mapping):
"""map ori offset to token offset
"""
for index, span in enumerate(offset_mapping):
if span[0] <= ori_offset < span[1]:
return index
return -1
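# Illustrative example (hypothetical offset_mapping): with offset_mapping [(0, 0), (0, 2), (2, 3)],
# map_offset(2, offset_mapping) returns 2, because character offset 2 falls inside the token span (2, 3).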
# Data preparation
train_ds = IEDataset(train_path, tokenizer=tokenizer, max_seq_len=max_seq_len, fewshot=None)
dev_ds = IEDataset(dev_path, tokenizer=tokenizer, max_seq_len=max_seq_len)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(dev_ds, batch_size=batch_size, collate_fn=collate_fn)
class MyLoss(nn.Module):
def forward(self, y_pred, y_true):
start_prob, end_prob = y_pred
start_ids, end_ids = y_true
loss_start = torch.nn.functional.binary_cross_entropy(start_prob, start_ids)
loss_end = torch.nn.functional.binary_cross_entropy(end_prob, end_ids)
return loss_start + loss_end
uie_model.compile(
loss=MyLoss(),
optimizer=torch.optim.AdamW(lr=learning_rate, params=uie_model.parameters()),
)
class SpanEvaluator(Callback):
"""SpanEvaluator computes the precision, recall and F1-score for span detection.
"""
def __init__(self):
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
self.best_val_f1 = 0
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = self.evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val-entity level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
    def evaluate(self, dataloader):
        self.reset()
        for x_true, y_true in dataloader:
start_prob, end_prob = uie_model.predict(*x_true)
start_ids, end_ids = y_true
num_correct, num_infer, num_label = self.compute(start_prob, end_prob, start_ids, end_ids)
self.update(num_correct, num_infer, num_label)
precision, recall, f1 = self.accumulate()
return f1, precision, recall
def compute(self, start_probs, end_probs, gold_start_ids, gold_end_ids):
"""Computes the precision, recall and F1-score for span detection.
"""
start_probs = start_probs.cpu().numpy()
end_probs = end_probs.cpu().numpy()
gold_start_ids = gold_start_ids.cpu().numpy()
gold_end_ids = gold_end_ids.cpu().numpy()
pred_start_ids = get_bool_ids_greater_than(start_probs)
pred_end_ids = get_bool_ids_greater_than(end_probs)
gold_start_ids = get_bool_ids_greater_than(gold_start_ids.tolist())
gold_end_ids = get_bool_ids_greater_than(gold_end_ids.tolist())
num_correct_spans = 0
num_infer_spans = 0
num_label_spans = 0
for predict_start_ids, predict_end_ids, label_start_ids, label_end_ids in zip(
pred_start_ids, pred_end_ids, gold_start_ids, gold_end_ids):
[_correct, _infer, _label] = self.eval_span(predict_start_ids, predict_end_ids, label_start_ids, label_end_ids)
num_correct_spans += _correct
num_infer_spans += _infer
num_label_spans += _label
return num_correct_spans, num_infer_spans, num_label_spans
def update(self, num_correct_spans, num_infer_spans, num_label_spans):
"""
This function takes (num_infer_spans, num_label_spans, num_correct_spans) as input,
to accumulate and update the corresponding status of the SpanEvaluator object.
"""
self.num_infer_spans += num_infer_spans
self.num_label_spans += num_label_spans
self.num_correct_spans += num_correct_spans
def eval_span(self, predict_start_ids, predict_end_ids, label_start_ids,
label_end_ids):
"""
evaluate position extraction (start, end)
return num_correct, num_infer, num_label
input: [1, 2, 10] [4, 12] [2, 10] [4, 11]
output: (1, 2, 2)
"""
pred_set = get_span(predict_start_ids, predict_end_ids)
label_set = get_span(label_start_ids, label_end_ids)
num_correct = len(pred_set & label_set)
num_infer = len(pred_set)
num_label = len(label_set)
return (num_correct, num_infer, num_label)
def accumulate(self):
"""
This function returns the mean precision, recall and f1 score for all accumulated minibatches.
Returns:
tuple: Returns tuple (`precision, recall, f1 score`).
"""
precision = float(self.num_correct_spans / self.num_infer_spans) if self.num_infer_spans else 0.
recall = float(self.num_correct_spans / self.num_label_spans) if self.num_label_spans else 0.
f1_score = float(2 * precision * recall / (precision + recall)) if self.num_correct_spans else 0.
return precision, recall, f1_score
def reset(self):
"""
Reset function empties the evaluation memory for previous mini-batches.
"""
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
if __name__ == "__main__":
evaluator = SpanEvaluator()
print('zero_shot performance: ', evaluator.evaluate(valid_dataloader))
uie_model.fit(train_dataloader, epochs=num_epochs, steps_per_epoch=None, callbacks=[evaluator])
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.losses import FocalLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel, BERT
from tqdm import tqdm
config_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class UIE(BERT):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
hidden_size = self.hidden_size
self.linear_start = nn.Linear(hidden_size, 1)
self.linear_end = nn.Linear(hidden_size, 1)
self.sigmoid = nn.Sigmoid()
        if kwargs.get('use_task_id'):
# Add task type embedding to BERT
task_type_embeddings = nn.Embedding(kwargs.get('task_type_vocab_size'), self.hidden_size)
self.embeddings.task_type_embeddings = task_type_embeddings
def hook(module, input, output):
return output+task_type_embeddings(torch.zeros(input[0].size(), dtype=torch.int64, device=input[0].device))
self.embeddings.word_embeddings.register_forward_hook(hook)
def forward(self, token_ids, token_type_ids):
outputs = super().forward([token_ids, token_type_ids])
sequence_output = outputs[0]
start_logits = self.linear_start(sequence_output)
start_logits = torch.squeeze(start_logits, -1)
start_prob = self.sigmoid(start_logits)
end_logits = self.linear_end(sequence_output)
end_logits = torch.squeeze(end_logits, -1)
end_prob = self.sigmoid(end_logits)
return start_prob, end_prob
@torch.no_grad()
def predict(self, token_ids, token_type_ids):
self.eval()
start_prob, end_prob = self.forward(token_ids, token_type_ids)
return start_prob, end_prob
uie_model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=UIE, with_pool=True)
import numpy as np
import math
import torch
from bert4torch.snippets import sequence_padding
from utils import get_bool_ids_greater_than, get_span, get_id_and_prob, cut_chinese_sent, dbc2sbc
from pprint import pprint
import torch.nn.functional as F
class UIEPredictor(object):
def __init__(self, schema, device='cpu', position_prob=0.5, max_seq_len=512, batch_size=64, split_sentence=False):
self._device = device
self._position_prob = position_prob
self._max_seq_len = max_seq_len
        self._batch_size = batch_size
        self._split_sentence = split_sentence
self._schema_tree = None
self.set_schema(schema)
from model import uie_model, tokenizer
self._tokenizer = tokenizer
self.model = uie_model.to(self._device)
def set_schema(self, schema):
if isinstance(schema, dict) or isinstance(schema, str):
schema = [schema]
self._schema_tree = self._build_tree(schema)
def __call__(self, inputs):
texts = inputs
texts = [texts] if isinstance(texts, str) else texts
results = self._multi_stage_predict(texts)
return results
def _multi_stage_predict(self, datas):
"""构建schema tree和预测
"""
results = [{} for _ in range(len(datas))]
# input check to early return
if len(datas) < 1 or self._schema_tree is None:
return results
# copy to stay `self._schema_tree` unchanged
schema_list = self._schema_tree.children[:]
while len(schema_list) > 0:
node = schema_list.pop(0)
examples = []
input_map = {}
cnt = 0
idx = 0
if not node.prefix:
for data in datas:
examples.append({"text": data, "prompt": dbc2sbc(node.name)})
input_map[cnt] = [idx]
idx += 1
cnt += 1
else:
for pre, data in zip(node.prefix, datas):
if len(pre) == 0:
input_map[cnt] = []
else:
for p in pre:
examples.append({ "text": data, "prompt": dbc2sbc(p + node.name)})
input_map[cnt] = [i + idx for i in range(len(pre))]
idx += len(pre)
cnt += 1
if len(examples) == 0:
result_list = []
else:
result_list = self._single_stage_predict(examples)
if not node.parent_relations:
relations = [[] for i in range(len(datas))]
for k, v in input_map.items():
for idx in v:
if len(result_list[idx]) == 0:
continue
if node.name not in results[k].keys():
results[k][node.name] = result_list[idx]
else:
results[k][node.name].extend(result_list[idx])
if node.name in results[k].keys():
relations[k].extend(results[k][node.name])
else:
relations = node.parent_relations
for k, v in input_map.items():
for i in range(len(v)):
if len(result_list[v[i]]) == 0:
continue
if "relations" not in relations[k][i].keys():
relations[k][i]["relations"] = {
node.name: result_list[v[i]]
}
elif node.name not in relations[k][i]["relations"].keys(
):
relations[k][i]["relations"][
node.name] = result_list[v[i]]
else:
relations[k][i]["relations"][node.name].extend(
result_list[v[i]])
new_relations = [[] for i in range(len(datas))]
for i in range(len(relations)):
for j in range(len(relations[i])):
if "relations" in relations[i][j].keys(
) and node.name in relations[i][j]["relations"].keys():
for k in range(
len(relations[i][j]["relations"][
node.name])):
new_relations[i].append(relations[i][j][
"relations"][node.name][k])
relations = new_relations
prefix = [[] for _ in range(len(datas))]
for k, v in input_map.items():
for idx in v:
for i in range(len(result_list[idx])):
prefix[k].append(result_list[idx][i]["text"] + "的")
for child in node.children:
child.prefix = prefix
child.parent_relations = relations
schema_list.append(child)
return results
def _convert_ids_to_results(self, examples, sentence_ids, probs):
"""
Convert ids to raw text in a single stage.
"""
results = []
for example, sentence_id, prob in zip(examples, sentence_ids, probs):
if len(sentence_id) == 0:
results.append([])
continue
result_list = []
text = example["text"]
prompt = example["prompt"]
for i in range(len(sentence_id)):
start, end = sentence_id[i]
if start < 0 and end >= 0:
continue
if end < 0:
start += (len(prompt) + 1)
end += (len(prompt) + 1)
result = {"text": prompt[start:end],
"probability": prob[i]}
result_list.append(result)
else:
result = {
"text": text[start:end],
"start": start,
"end": end,
"probability": prob[i]
}
result_list.append(result)
results.append(result_list)
return results
def _auto_splitter(self, input_texts, max_text_len, split_sentence=False):
'''
Split the raw texts automatically for model inference.
Args:
input_texts (List[str]): input raw texts.
max_text_len (int): cutting length.
split_sentence (bool): If True, sentence-level split will be performed.
return:
short_input_texts (List[str]): the short input texts for model inference.
input_mapping (dict): mapping between raw text and short input texts.
'''
input_mapping = {}
short_input_texts = []
cnt_org = 0
cnt_short = 0
for text in input_texts:
if not split_sentence:
sens = [text]
else:
sens = cut_chinese_sent(text)
for sen in sens:
lens = len(sen)
if lens <= max_text_len:
short_input_texts.append(sen)
if cnt_org not in input_mapping.keys():
input_mapping[cnt_org] = [cnt_short]
else:
input_mapping[cnt_org].append(cnt_short)
cnt_short += 1
else:
temp_text_list = [sen[i:i + max_text_len] for i in range(0, lens, max_text_len)]
short_input_texts.extend(temp_text_list)
short_idx = cnt_short
cnt_short += math.ceil(lens / max_text_len)
temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)]
if cnt_org not in input_mapping.keys():
input_mapping[cnt_org] = temp_text_id
else:
input_mapping[cnt_org].extend(temp_text_id)
cnt_org += 1
return short_input_texts, input_mapping
def _single_stage_predict(self, inputs):
input_texts = []
prompts = []
for i in range(len(inputs)):
input_texts.append(inputs[i]["text"])
prompts.append(inputs[i]["prompt"])
        # the max predict length should exclude the length of the longest prompt and the summary tokens
        max_predict_len = self._max_seq_len - max(len(p) for p in prompts) - 3
short_input_texts, self.input_mapping = self._auto_splitter(input_texts, max_predict_len, split_sentence=self._split_sentence)
short_texts_prompts = []
for k, v in self.input_mapping.items():
short_texts_prompts.extend([prompts[k] for i in range(len(v))])
short_inputs = [{"text": short_input_texts[i], "prompt": short_texts_prompts[i]} for i in range(len(short_input_texts))]
token_ids, segment_ids, offset_maps = self._tokenizer.encode(short_texts_prompts, short_input_texts, maxlen=self._max_seq_len, return_offsets='transformers')
start_prob_concat, end_prob_concat = [], []
for batch_start in range(0, len(short_input_texts), self._batch_size):
batch_token_ids = token_ids[batch_start:batch_start+self._batch_size]
batch_segment_ids = segment_ids[batch_start:batch_start+self._batch_size]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=self._device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=self._device)
start_prob, end_prob = self.model.predict(batch_token_ids, batch_segment_ids)
start_prob_concat.append(start_prob.cpu().numpy())
end_prob_concat.append(end_prob.cpu().numpy())
start_prob_concat = np.concatenate(start_prob_concat)
end_prob_concat = np.concatenate(end_prob_concat)
start_ids_list = get_bool_ids_greater_than(start_prob_concat, limit=self._position_prob, return_prob=True)
end_ids_list = get_bool_ids_greater_than(end_prob_concat, limit=self._position_prob, return_prob=True)
sentence_ids = []
probs = []
for start_ids, end_ids, ids, offset_map in zip(start_ids_list, end_ids_list, token_ids, offset_maps):
for i in reversed(range(len(ids))):
if ids[i] != 0:
ids = ids[:i]
break
span_list = get_span(start_ids, end_ids, with_prob=True)
sentence_id, prob = get_id_and_prob(span_list, offset_map)
sentence_ids.append(sentence_id)
probs.append(prob)
results = self._convert_ids_to_results(short_inputs, sentence_ids, probs)
results = self._auto_joiner(results, short_input_texts, self.input_mapping)
return results
def _auto_joiner(self, short_results, short_inputs, input_mapping):
concat_results = []
is_cls_task = False
for short_result in short_results:
if short_result == []:
continue
elif 'start' not in short_result[0].keys(
) and 'end' not in short_result[0].keys():
is_cls_task = True
break
else:
break
for k, vs in input_mapping.items():
if is_cls_task:
cls_options = {}
single_results = []
for v in vs:
if len(short_results[v]) == 0:
continue
if short_results[v][0]['text'] not in cls_options.keys():
cls_options[short_results[v][0][
'text']] = [1, short_results[v][0]['probability']]
else:
cls_options[short_results[v][0]['text']][0] += 1
cls_options[short_results[v][0]['text']][
1] += short_results[v][0]['probability']
if len(cls_options) != 0:
cls_res, cls_info = max(cls_options.items(),
key=lambda x: x[1])
concat_results.append([{
'text': cls_res,
'probability': cls_info[1] / cls_info[0]
}])
else:
concat_results.append([])
else:
offset = 0
single_results = []
for v in vs:
if v == 0:
single_results = short_results[v]
offset += len(short_inputs[v])
else:
for i in range(len(short_results[v])):
if 'start' not in short_results[v][
i] or 'end' not in short_results[v][i]:
continue
short_results[v][i]['start'] += offset
short_results[v][i]['end'] += offset
offset += len(short_inputs[v])
single_results.extend(short_results[v])
concat_results.append(single_results)
return concat_results
def predict(self, input_data):
results = self._multi_stage_predict(input_data)
return results
@classmethod
def _build_tree(cls, schema, name='root'):
"""
Build the schema tree.
"""
schema_tree = SchemaTree(name)
for s in schema:
if isinstance(s, str):
schema_tree.add_child(SchemaTree(s))
elif isinstance(s, dict):
for k, v in s.items():
if isinstance(v, str):
child = [v]
elif isinstance(v, list):
child = v
else:
raise TypeError("Invalid schema, value for each key:value pairs should be list or string but {} received".format(type(v)))
schema_tree.add_child(cls._build_tree(child, name=k))
else:
raise TypeError("Invalid schema, element should be string or dict, but {} received".format(type(s)))
return schema_tree
class SchemaTree(object):
"""SchemaTree的实现
"""
def __init__(self, name='root', children=None):
self.name = name
self.children = []
self.prefix = None
self.parent_relations = None
if children is not None:
for child in children:
self.add_child(child)
def __repr__(self):
return self.name
def add_child(self, node):
        assert isinstance(node, SchemaTree), "The children of a node should be an instance of SchemaTree."
self.children.append(node)
if __name__ == '__main__':
    # Named entity recognition
schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction
ie = UIEPredictor(schema=schema)
pprint(ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!"))
schema = ['肿瘤的大小', '肿瘤的个数', '肝癌级别', '脉管内癌栓分级']
ie.set_schema(schema)
pprint(ie("(右肝肿瘤)肝细胞性肝癌(II-III级,梁索型和假腺管型),肿瘤包膜不完整,紧邻肝被膜,侵及周围肝组织,未见脉管内癌栓(MVI分级:M0级)及卫星子灶形成。(肿物1个,大小4.2×4.0×2.8cm)。"))
    # Relation extraction
schema = {'竞赛名称': ['主办方', '承办方', '已举办次数']}
ie.set_schema(schema) # Reset schema
pprint(ie('2022语言与智能技术竞赛由中国中文信息学会和中国计算机学会联合主办,百度公司、中国中文信息学会评测工作委员会和中国计算机学会自然语言处理专委会承办,已连续举办4届,成为全球最热门的中文NLP赛事之一。'))
    # Event extraction
schema = {'地震触发词': ['地震强度', '时间', '震中位置', '震源深度']}
ie.set_schema(schema) # Reset schema
ie('中国地震台网正式测定:5月16日06时08分在云南临沧市凤庆县(北纬24.34度,东经99.98度)发生3.5级地震,震源深度10千米。')
    # Opinion extraction (reviews)
schema = {'评价维度': ['观点词', '情感倾向[正向,负向]']}
ie.set_schema(schema) # Reset schema
pprint(ie("店面干净,很清静,服务员服务热情,性价比很高,发现收银台有排队"))
    # Sentiment classification
schema = '情感倾向[正向,负向]'
ie.set_schema(schema)
ie('这个产品用起来真的很流畅,我非常喜欢')
import contextlib
import functools
import json
import logging
import math
import random
import re
import shutil
import threading
import time
from functools import partial
import colorlog
import numpy as np
import torch
from colorama import Back, Fore
from tqdm import tqdm
loggers = {}
log_config = {
'DEBUG': {'level': 10, 'color': 'purple'},
'INFO': {'level': 20, 'color': 'green'},
'TRAIN': {'level': 21, 'color': 'cyan'},
'EVAL': {'level': 22, 'color': 'blue'},
'WARNING': {'level': 30, 'color': 'yellow'},
'ERROR': {'level': 40, 'color': 'red'},
'CRITICAL': {'level': 50, 'color': 'bold_red'}
}
def get_span(start_ids, end_ids, with_prob=False):
"""
Get span set from position start and end list.
Args:
start_ids (List[int]/List[tuple]): The start index list.
end_ids (List[int]/List[tuple]): The end index list.
        with_prob (bool): If True, each element of start_ids and end_ids is a tuple like (index, probability).
    Returns:
        set: The span set without overlapping; every id can only be used once.
    """
if with_prob:
start_ids = sorted(start_ids, key=lambda x: x[0])
end_ids = sorted(end_ids, key=lambda x: x[0])
else:
start_ids = sorted(start_ids)
end_ids = sorted(end_ids)
start_pointer = 0
end_pointer = 0
len_start = len(start_ids)
len_end = len(end_ids)
couple_dict = {}
while start_pointer < len_start and end_pointer < len_end:
if with_prob:
start_id = start_ids[start_pointer][0]
end_id = end_ids[end_pointer][0]
else:
start_id = start_ids[start_pointer]
end_id = end_ids[end_pointer]
if start_id == end_id:
couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
start_pointer += 1
end_pointer += 1
continue
if start_id < end_id:
couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
start_pointer += 1
continue
if start_id > end_id:
end_pointer += 1
continue
result = [(couple_dict[end], end) for end in couple_dict]
result = set(result)
return result
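# Illustrative example: get_span([1, 4], [3, 5]) returns {(1, 3), (4, 5)}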
def get_bool_ids_greater_than(probs, limit=0.5, return_prob=False):
"""
    Get the indices along the last dimension of the probability arrays whose values are greater than the given limit.
Args:
probs (List[List[float]]): The input probability arrays.
limit (float): The limitation for probability.
return_prob (bool): Whether to return the probability
Returns:
        List[List[int]]: The indices along the last dimension that meet the condition.
"""
probs = np.array(probs)
dim_len = len(probs.shape)
if dim_len > 1:
result = []
for p in probs:
result.append(get_bool_ids_greater_than(p, limit, return_prob))
return result
else:
result = []
for i, p in enumerate(probs):
if p > limit:
if return_prob:
result.append((i, p))
else:
result.append(i)
return result
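# Illustrative example: get_bool_ids_greater_than([[0.1, 0.7], [0.9, 0.2]], limit=0.5) returns [[1], [0]]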
class Logger(object):
'''
    Default logger in UIE
Args:
name(str) : Logger name, default is 'UIE'
'''
def __init__(self, name: str = None):
name = 'UIE' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(
self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(
self.__call__, conf['level'])
self.format = colorlog.ColoredFormatter(
'%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={key: conf['color']
for key, conf in log_config.items()})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float = 0.1):
'''
Continuously print a progress bar with rotating special effects.
Args:
msg(str): Message to be printed.
interval(float): Rotation interval. Default to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
t = threading.Thread(target=_printer)
t.start()
yield
end = True
logger = Logger()
BAR_FORMAT = f'{{desc}}: {Fore.GREEN}{{percentage:3.0f}}%{Fore.RESET} {Fore.BLUE}{{bar}}{Fore.RESET} {Fore.GREEN}{{n_fmt}}/{{total_fmt}} {Fore.RED}{{rate_fmt}}{{postfix}}{Fore.RESET} eta {Fore.CYAN}{{remaining}}{Fore.RESET}'
BAR_FORMAT_NO_TIME = f'{{desc}}: {Fore.GREEN}{{percentage:3.0f}}%{Fore.RESET} {Fore.BLUE}{{bar}}{Fore.RESET} {Fore.GREEN}{{n_fmt}}/{{total_fmt}}{Fore.RESET}'
BAR_TYPE = [
"░▝▗▖▘▚▞▛▙█",
"░▖▘▝▗▚▞█",
" ▖▘▝▗▚▞█",
"░▒█",
" >=",
" ▏▎▍▌▋▊▉█"
"░▏▎▍▌▋▊▉█"
]
tqdm = partial(tqdm, bar_format=BAR_FORMAT, ascii=BAR_TYPE[0], leave=False)
def get_id_and_prob(spans, offset_map):
prompt_length = 0
for i in range(1, len(offset_map)):
if offset_map[i] != [0, 0]:
prompt_length += 1
else:
break
for i in range(1, prompt_length + 1):
offset_map[i][0] -= (prompt_length + 1)
offset_map[i][1] -= (prompt_length + 1)
sentence_id = []
prob = []
for start, end in spans:
prob.append(start[1] * end[1])
sentence_id.append(
(offset_map[start[0]][0], offset_map[end[0]][1]))
return sentence_id, prob
def cut_chinese_sent(para):
"""
Cut the Chinese sentences more precisely, reference to
"https://blog.csdn.net/blmoistawinde/article/details/82379256".
"""
para = re.sub(r'([。!?\?])([^”’])', r'\1\n\2', para)
para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)
para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
para = para.rstrip()
return para.split("\n")
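# Illustrative example: cut_chinese_sent("今天天气好。明天下雨!") returns ['今天天气好。', '明天下雨!']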
def dbc2sbc(s):
rs = ""
for char in s:
code = ord(char)
if code == 0x3000:
code = 0x0020
else:
code -= 0xfee0
if not (0x0021 <= code and code <= 0x7e):
rs += char
continue
rs += chr(code)
return rs
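# Illustrative example: dbc2sbc("ABC 123") returns "ABC 123" (full-width characters converted to half-width)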
def convert_cls_examples(raw_examples, prompt_prefix, options):
examples = []
logger.info(f"Converting doccano data...")
with tqdm(total=len(raw_examples)) as pbar:
for line in raw_examples:
items = json.loads(line)
# Compatible with doccano >= 1.6.2
if "data" in items.keys():
text, labels = items["data"], items["label"]
else:
text, labels = items["text"], items["label"]
random.shuffle(options)
prompt = ""
sep = ","
for option in options:
prompt += option
prompt += sep
prompt = prompt_prefix + "[" + prompt.rstrip(sep) + "]"
result_list = []
example = {
"content": text,
"result_list": result_list,
"prompt": prompt
}
for label in labels:
start = prompt.rfind(label[0]) - len(prompt) - 1
end = start + len(label)
result = {"text": label, "start": start, "end": end}
example["result_list"].append(result)
examples.append(example)
return examples
def add_negative_example(examples, texts, prompts, label_set, negative_ratio):
negative_examples = []
positive_examples = []
with tqdm(total=len(prompts)) as pbar:
for i, prompt in enumerate(prompts):
negative_sample = []
redundants_list = list(set(label_set) ^ set(prompt))
redundants_list.sort()
num_positive = len(examples[i])
if num_positive != 0:
actual_ratio = math.ceil(len(redundants_list) / num_positive)
else:
# Set num_positive to 1 for text without positive example
num_positive, actual_ratio = 1, 0
if actual_ratio <= negative_ratio or negative_ratio == -1:
idxs = [k for k in range(len(redundants_list))]
else:
idxs = random.sample(
range(0, len(redundants_list)),
negative_ratio * num_positive)
for idx in idxs:
negative_result = {
"content": texts[i],
"result_list": [],
"prompt": redundants_list[idx]
}
negative_examples.append(negative_result)
positive_examples.extend(examples[i])
pbar.update(1)
return positive_examples, negative_examples
def add_full_negative_example(examples, texts, relation_prompts, predicate_set,
subject_goldens):
with tqdm(total=len(relation_prompts)) as pbar:
for i, relation_prompt in enumerate(relation_prompts):
negative_sample = []
for subject in subject_goldens[i]:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = subject + "的" + predicate
if prompt not in relation_prompt:
negative_result = {
"content": texts[i],
"result_list": [],
"prompt": prompt
}
negative_sample.append(negative_result)
examples[i].extend(negative_sample)
pbar.update(1)
return examples
def construct_relation_prompt_set(entity_name_set, predicate_set):
relation_prompt_set = set()
for entity_name in entity_name_set:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
relation_prompt = entity_name + "的" + predicate
relation_prompt_set.add(relation_prompt)
return sorted(list(relation_prompt_set))
def convert_ext_examples(raw_examples, negative_ratio, is_train=True):
texts = []
entity_examples = []
relation_examples = []
entity_prompts = []
relation_prompts = []
entity_label_set = []
entity_name_set = []
predicate_set = []
subject_goldens = []
logger.info(f"Converting doccano data...")
with tqdm(total=len(raw_examples)) as pbar:
for line in raw_examples:
items = json.loads(line)
entity_id = 0
if "data" in items.keys():
relation_mode = False
if isinstance(items["label"],
dict) and "entities" in items["label"].keys():
relation_mode = True
text = items["data"]
entities = []
if not relation_mode:
# Export file in JSONL format which doccano < 1.7.0
for item in items["label"]:
entity = {
"id": entity_id,
"start_offset": item[0],
"end_offset": item[1],
"label": item[2]
}
entities.append(entity)
entity_id += 1
else:
# Export file in JSONL format for relation labeling task which doccano < 1.7.0
for item in items["label"]["entities"]:
entity = {
"id": entity_id,
"start_offset": item["start_offset"],
"end_offset": item["end_offset"],
"label": item["label"]
}
entities.append(entity)
entity_id += 1
relations = []
else:
# Export file in JSONL format which doccano >= 1.7.0
if "label" in items.keys():
text = items["text"]
entities = []
for item in items["label"]:
entity = {
"id": entity_id,
"start_offset": item[0],
"end_offset": item[1],
"label": item[2]
}
entities.append(entity)
entity_id += 1
relations = []
else:
# Export file in JSONL (relation) format
text, relations, entities = items["text"], items[
"relations"], items["entities"]
texts.append(text)
entity_example = []
entity_prompt = []
entity_example_map = {}
entity_map = {} # id to entity name
for entity in entities:
entity_name = text[entity["start_offset"]:entity["end_offset"]]
entity_map[entity["id"]] = {
"name": entity_name,
"start": entity["start_offset"],
"end": entity["end_offset"]
}
entity_label = entity["label"]
result = {
"text": entity_name,
"start": entity["start_offset"],
"end": entity["end_offset"]
}
if entity_label not in entity_example_map.keys():
entity_example_map[entity_label] = {
"content": text,
"result_list": [result],
"prompt": entity_label
}
else:
entity_example_map[entity_label]["result_list"].append(
result)
if entity_label not in entity_label_set:
entity_label_set.append(entity_label)
if entity_name not in entity_name_set:
entity_name_set.append(entity_name)
entity_prompt.append(entity_label)
for v in entity_example_map.values():
entity_example.append(v)
entity_examples.append(entity_example)
entity_prompts.append(entity_prompt)
subject_golden = []
relation_example = []
relation_prompt = []
relation_example_map = {}
for relation in relations:
predicate = relation["type"]
subject_id = relation["from_id"]
object_id = relation["to_id"]
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = entity_map[subject_id]["name"] + "的" + predicate
if entity_map[subject_id]["name"] not in subject_golden:
subject_golden.append(entity_map[subject_id]["name"])
result = {
"text": entity_map[object_id]["name"],
"start": entity_map[object_id]["start"],
"end": entity_map[object_id]["end"]
}
if prompt not in relation_example_map.keys():
relation_example_map[prompt] = {
"content": text,
"result_list": [result],
"prompt": prompt
}
else:
relation_example_map[prompt]["result_list"].append(result)
if predicate not in predicate_set:
predicate_set.append(predicate)
relation_prompt.append(prompt)
for v in relation_example_map.values():
relation_example.append(v)
relation_examples.append(relation_example)
relation_prompts.append(relation_prompt)
subject_goldens.append(subject_golden)
pbar.update(1)
def concat_examples(positive_examples, negative_examples, negative_ratio):
examples = []
if math.ceil(len(negative_examples) /
len(positive_examples)) <= negative_ratio:
examples = positive_examples + negative_examples
else:
# Random sampling the negative examples to ensure overall negative ratio unchanged.
idxs = random.sample(
range(0, len(negative_examples)),
negative_ratio * len(positive_examples))
negative_examples_sampled = []
for idx in idxs:
negative_examples_sampled.append(negative_examples[idx])
examples = positive_examples + negative_examples_sampled
return examples
logger.info(f"Adding negative samples for first stage prompt...")
positive_examples, negative_examples = add_negative_example(
entity_examples, texts, entity_prompts, entity_label_set,
negative_ratio)
if len(positive_examples) == 0:
all_entity_examples = []
elif is_train:
all_entity_examples = concat_examples(positive_examples,
negative_examples, negative_ratio)
else:
all_entity_examples = positive_examples + negative_examples
all_relation_examples = []
if len(predicate_set) != 0:
if is_train:
logger.info(f"Adding negative samples for second stage prompt...")
relation_prompt_set = construct_relation_prompt_set(entity_name_set,
predicate_set)
positive_examples, negative_examples = add_negative_example(
relation_examples, texts, relation_prompts, relation_prompt_set,
negative_ratio)
all_relation_examples = concat_examples(
positive_examples, negative_examples, negative_ratio)
else:
logger.info(f"Adding negative samples for second stage prompt...")
relation_examples = add_full_negative_example(
relation_examples, texts, relation_prompts, predicate_set,
subject_goldens)
            all_relation_examples = [
                r
                for relation_example in relation_examples
                for r in relation_example
            ]
return all_entity_examples, all_relation_examples
def get_path_from_url(url,
root_dir,
check_exist=True,
decompress=True):
""" Download from given url to root_dir.
    If the file or directory specified by url already exists under
    root_dir, return its path directly; otherwise download it
    from url, decompress it if needed, and return the path.
Args:
url (str): download url
root_dir (str): root dir for downloading, it should be
WEIGHTS_HOME or DATASET_HOME
decompress (bool): decompress zip or tar file. Default is `True`
Returns:
str: a local path to save downloaded models & weights & datasets.
"""
import os.path
import os
import tarfile
import zipfile
def is_url(path):
"""
Whether path is URL.
Args:
path (string): URL string or not.
"""
return path.startswith('http://') or path.startswith('https://')
def _map_path(url, root_dir):
# parse path after download under root_dir
fname = os.path.split(url)[-1]
fpath = fname
return os.path.join(root_dir, fpath)
def _get_download(url, fullname):
import requests
# using requests.get method
fname = os.path.basename(fullname)
try:
req = requests.get(url, stream=True)
except Exception as e: # requests.exceptions.ConnectionError
logger.info("Downloading {} from {} failed with exception {}".format(
fname, url, str(e)))
return False
if req.status_code != 200:
raise RuntimeError("Downloading from {} failed with code "
"{}!".format(url, req.status_code))
        # To guard against interrupted downloads, download to
        # tmp_fullname first, then move tmp_fullname to fullname
        # once the download has finished
tmp_fullname = fullname + "_tmp"
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
with tqdm(total=(int(total_size) + 1023) // 1024, unit='KB') as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
pbar.update(1)
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
shutil.move(tmp_fullname, fullname)
return fullname
def _download(url, path):
"""
Download from url, save to path.
url (str): download url
path (str): download to given path
"""
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.split(url)[-1]
fullname = os.path.join(path, fname)
retry_cnt = 0
logger.info("Downloading {} from {}".format(fname, url))
DOWNLOAD_RETRY_LIMIT = 3
while not os.path.exists(fullname):
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
retry_cnt += 1
else:
raise RuntimeError("Download from {} failed. "
"Retry limit reached".format(url))
if not _get_download(url, fullname):
time.sleep(1)
continue
return fullname
def _uncompress_file_zip(filepath):
with zipfile.ZipFile(filepath, 'r') as files:
file_list = files.namelist()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
# `strip(os.sep)` to remove `os.sep` in the tail of path
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _is_a_single_file(file_list):
if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
return True
return False
def _is_a_single_dir(file_list):
new_file_list = []
for file_path in file_list:
if '/' in file_path:
file_path = file_path.replace('/', os.sep)
elif '\\' in file_path:
file_path = file_path.replace('\\', os.sep)
new_file_list.append(file_path)
file_name = new_file_list[0].split(os.sep)[0]
for i in range(1, len(new_file_list)):
if file_name != new_file_list[i].split(os.sep)[0]:
return False
return True
def _uncompress_file_tar(filepath, mode="r:*"):
with tarfile.open(filepath, mode) as files:
file_list = files.getnames()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _decompress(fname):
"""
Decompress for zip and tar file
"""
logger.info("Decompressing {}...".format(fname))
        # To guard against interrupted decompression, decompress to a
        # temporary directory first; if decompression succeeds, move the
        # decompressed files to fpath, delete the temporary directory,
        # and remove the downloaded compressed file.
if tarfile.is_tarfile(fname):
uncompressed_path = _uncompress_file_tar(fname)
elif zipfile.is_zipfile(fname):
uncompressed_path = _uncompress_file_zip(fname)
else:
raise TypeError("Unsupport compress file type {}".format(fname))
return uncompressed_path
    assert is_url(url), "downloading from {} is not a url".format(url)
fullpath = _map_path(url, root_dir)
if os.path.exists(fullpath) and check_exist:
logger.info("Found {}".format(fullpath))
else:
fullpath = _download(url, root_dir)
if decompress and (tarfile.is_tarfile(fullpath) or
zipfile.is_zipfile(fullpath)):
fullpath = _decompress(fullpath)
return fullpath
#! -*- coding:utf-8 -*-
# DDP (DistributedDataParallel) example
# Launch command: python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 task_distributed_data_parallel.py
import os
# Can also be passed in on the command line
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModelDDP
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)
torch.distributed.init_process_group(backend='nccl')
# Model settings
maxlen = 256
batch_size = 16
config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = '/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data as (text, label) pairs; each line of the file should contain a text and an
        integer label separated by a tab, the same format used by the DP and AMP examples.
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text, label = l.strip().split('\t')
                D.append((text, int(label)))
        return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels.flatten()], None
# Load the dataset
train_dataset = MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT; here the loss is computed inside the model's forward
class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, token_ids, segment_ids, labels):
        _, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        loss = self.loss_fn(output, labels)
        return loss
model = Model().to(device)
# Wrap the model with DDP for multi-GPU training; master_rank is the local_rank that prints training progress
model = BaseModelDDP(model, master_rank=0, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
# Define the loss and optimizer; custom ones are supported
model.compile(
    loss=lambda x, _: x,  # pass through the loss already computed in forward
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=None)
#! -*- coding:utf-8 -*-
# Mixed precision (AMP) training example; in testing, GPU memory usage dropped by about 15%
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the dataset
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
use_amp=True, # True表示使用混合精度训练(automatic mixed precision)
metrics=['accuracy'],
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=None)
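# ------------------------------------------------------------------------------
# [Editor's note] Minimal sketch of what mixed-precision training roughly
# corresponds to in plain PyTorch (autocast + GradScaler). This is an
# illustration only, not bert4torch's internal implementation of use_amp; the
# tiny model and random data below are hypothetical placeholders.
import torch
import torch.nn as nn
import torch.optim as optim

def amp_training_sketch(steps=5):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    net = nn.Linear(16, 2).to(device)                   # placeholder model
    optimizer = optim.Adam(net.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
    for _ in range(steps):
        x = torch.randn(8, 16, device=device)           # placeholder batch
        y = torch.randint(0, 2, (8,), device=device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=(device == 'cuda')):
            loss = criterion(net(x), y)                  # forward runs in fp16 where safe
        scaler.scale(loss).backward()                    # scale loss to avoid fp16 underflow
        scaler.step(optimizer)                           # unscale grads, then optimizer step
        scaler.update()                                  # adjust the loss scale
# ------------------------------------------------------------------------------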
#! -*- coding:utf-8 -*-
# DP示例:把loss放在模型内部计算,可以部分缓解DataParallel的负载不均衡问题
import os
# 也可命令行传入
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel, BaseModelDP
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels.flatten()], None
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 定义bert上的模型结构,这里loss是放在模型内部计算的
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, token_ids, segment_ids, labels):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
loss = self.loss_fn(output, labels)
return loss
model = Model().to(device)
model = BaseModelDP(model) # 指定DP模型使用多gpu
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=lambda x, _: x.mean(), # 多个gpu计算的loss的均值
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=10)
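# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the idea in the comment above, using plain
# torch.nn.DataParallel (an assumption about what BaseModelDP wraps): when
# forward returns a per-replica scalar loss, DataParallel gathers one loss per
# GPU, so the training step only needs .mean() on device 0 instead of gathering
# full logits there. Placeholder model and data, not the bert4torch API.
import torch
import torch.nn as nn

class _LossInsideModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(16, 2)
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, x, y):
        return self.loss_fn(self.net(x), y)   # each replica returns its own scalar loss

def dp_sketch():
    model = _LossInsideModel()
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)        # replicas split the batch along dim 0
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    x = torch.randn(8, 16, device=device)
    y = torch.randint(0, 2, (8,), device=device)
    loss = model(x, y)
    loss = loss.mean()                        # with >1 GPU this averages the per-GPU losses
    loss.backward()
# ------------------------------------------------------------------------------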
#! -*- coding:utf-8 -*-
# DDP示例
# 启动命令:python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 task_distributed_data_parallel.py
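# [Editor's note] On newer PyTorch versions the same script is typically launched with
# `torchrun --nproc_per_node=2 task_distributed_data_parallel.py`; in that case the local
# rank is usually read from the LOCAL_RANK environment variable rather than --local_rank.
# This is an assumption about the runtime environment, not part of the original example.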
import os
# 也可命令行传入
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModelDDP
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)
torch.distributed.init_process_group(backend='nccl')
# 模型设置
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
# 固定seed
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels.flatten()], None
# 加载数据集
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, collate_fn=collate_fn)
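# [Editor's note] With a DistributedSampler, plain PyTorch training loops usually call
# train_sampler.set_epoch(epoch) at the start of every epoch so that shuffling differs
# across epochs; whether model.fit() does this internally is not shown here, so treat
# this as a reminder rather than a statement about the bert4torch API.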
# 定义bert上的模型结构,这里loss是放在模型内部计算的
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, token_ids, segment_ids, labels):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
loss = self.loss_fn(output, labels)
return loss
model = Model().to(device)
# 指定DDP模型使用多gpu, master_rank为指定用于打印训练过程的local_rank
model = BaseModelDDP(model, master_rank=0, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=lambda x, _: x, # 直接把forward计算的loss传出来
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=None)
#! -*- coding:utf-8 -*-
# 通过R-Drop增强模型的泛化性能
# 官方项目:https://github.com/dropreg/R-Drop
# 数据集:情感分类数据集
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything, text_segmentate, get_pool_emb
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import RDropLoss
from tqdm import tqdm
import torch.nn.functional as F
maxlen = 256
batch_size = 16
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
for _ in range(2):
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, dropout_rate=0.3, segment_vocab_size=0, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
model.compile(loss=RDropLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=['accuracy'])
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
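# ------------------------------------------------------------------------------
# [Editor's note] Minimal sketch of the R-Drop idea used above: every sample is
# fed twice (see the `for _ in range(2)` duplication in collate_fn), and the loss
# combines cross entropy on both passes with a symmetric KL term between the two
# dropout-perturbed predictions. Illustration of the technique only, not
# bert4torch's RDropLoss implementation; `alpha` is a hypothetical weight.
import torch
import torch.nn.functional as F

def rdrop_loss_sketch(logits, labels, alpha=4.0):
    """logits: [2*btz, num_labels], consecutive pairs are the two passes of the
    same sample; labels: [2*btz]."""
    ce = F.cross_entropy(logits, labels)
    p1, p2 = logits[0::2], logits[1::2]          # the two forward passes
    kl = F.kl_div(p1.log_softmax(dim=-1), p2.softmax(dim=-1), reduction='batchmean') + \
         F.kl_div(p2.log_softmax(dim=-1), p1.softmax(dim=-1), reduction='batchmean')
    return ce + alpha * kl / 2
# ------------------------------------------------------------------------------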
#! -*- coding:utf-8 -*-
# 通过TemporalEnsembling提升模型泛化
# 官方项目:https://github.com/s-laine/tempens
# pytorch第三方实现:https://github.com/ferretj/temporal-ensembling
# 数据集:情感分类数据集
# 本示例是把监督数据当成无监督数据使用
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything, text_segmentate, get_pool_emb
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import TemporalEnsemblingLoss
maxlen = 256
batch_size = 16
epochs = 10
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集,训练数据集shuffle=False
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class MyLoss(TemporalEnsemblingLoss):
def forward(self, y_pred, y_true):
# 这里把监督数据当成无监督数据使用,真实场景中可以使用大量真正的无监督数据
y_pred_sup, y_pred_unsup, y_true_sup = y_pred, y_pred, y_true
return super().forward(y_pred_sup, y_pred_unsup, y_true_sup, model.epoch, model.bti)
loss = MyLoss(epochs=epochs, max_batch_num=None)
model.compile(loss=loss, optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=['accuracy'])
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
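# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the core temporal-ensembling update from Laine & Aila
# (the paper linked above): keep an exponential moving average Z of past
# predictions per sample, bias-correct it, and add an MSE consistency term with a
# ramp-up weight. Illustration only, not bert4torch's TemporalEnsemblingLoss;
# `alpha` and `w_t` are hypothetical hyperparameters.
import torch
import torch.nn.functional as F

def temporal_ensembling_step(z, Z, epoch, alpha=0.6, w_t=1.0):
    """z: current softmax predictions [n, num_labels]; Z: accumulated ensemble
    predictions for the same samples (same shape); epoch starts at 0.
    Returns (unsup_loss, new_Z)."""
    new_Z = alpha * Z + (1 - alpha) * z.detach()       # accumulate ensemble predictions
    z_tilde = new_Z / (1 - alpha ** (epoch + 1))       # bias correction, as in Adam
    unsup_loss = w_t * F.mse_loss(z, z_tilde)          # consistency with the ensemble target
    return unsup_loss, new_Z
# ------------------------------------------------------------------------------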
#! -*- coding:utf-8 -*-
# 以文本分类(情感分类)为例的半监督学习UDA策略,https://arxiv.org/abs/1904.12848
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.losses import UDALoss
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import random
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])
# 理论上应该收集任务领域内的无监督数据,这里用所有的监督数据充当无监督数据
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]
def collate_fn(batch):
def add_noise(token_ids, del_ratio=0.3):
'''这里用随机删除做简单示例,实际中可以使用增删改等多种noise方案
'''
n = len(token_ids)
keep_or_not = np.random.rand(n) > del_ratio
if sum(keep_or_not) == 0:
keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains
return list(np.array(token_ids)[keep_or_not])
# batch_token_ids包含三部分:第一部分是有监督数据,第二部分是领域内的无监督数据,第三部分是无监督数据经数据增强后的数据
batch_token_ids, batch_labels = [[], [], []], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids[0].append(token_ids)
batch_labels.append([label])
# 无监督部分
unsup_text = random.choice(unsup_dataset) # 随机挑一个无监督数据
token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
batch_token_ids[1].append(token_ids)
batch_token_ids[2].append(token_ids[:1] + add_noise(token_ids[1:-1]) + token_ids[-1:]) # 无监督数据增强
batch_token_ids = [j for i in batch_token_ids for j in i]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class Loss(UDALoss):
def forward(self, y_pred, y_true_sup):
loss, loss_sup, loss_unsup = super().forward(y_pred, y_true_sup, model.global_step, model.total_steps)
return {'loss': loss, 'loss_sup': loss_sup, 'loss_unsup': loss_unsup}
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=Loss(tsa_schedule='linear_schedule', start_p=0.8), # 这里可换用不同的策略, 不为None时候要给定model
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['loss_sup', 'loss_unsup'] # Loss返回的key会自动计入metrics,即使这里不显式写出也会打印具体的Loss
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for token_ids, y_true in data:
y_pred = model.predict(token_ids[:y_true.size(0)]).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
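# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the UDA consistency term used above: the predictions
# on the original unsupervised samples act as a detached (optionally sharpened)
# target, and a KL term pulls the predictions on the augmented copies towards
# them; the total loss adds this to the supervised cross entropy (which UDA
# additionally masks with a TSA schedule). Illustration only, not bert4torch's
# UDALoss; `temperature` and `lam` are hypothetical hyperparameters.
import torch
import torch.nn.functional as F

def uda_loss_sketch(logits_sup, labels_sup, logits_unsup, logits_unsup_aug,
                    temperature=0.4, lam=1.0):
    loss_sup = F.cross_entropy(logits_sup, labels_sup)
    target = (logits_unsup / temperature).softmax(dim=-1).detach()   # sharpened, no grad
    loss_unsup = F.kl_div(logits_unsup_aug.log_softmax(dim=-1), target,
                          reduction='batchmean')
    return loss_sup + lam * loss_unsup
# ------------------------------------------------------------------------------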
#! -*- coding:utf-8 -*-
# 通过对抗训练/梯度惩罚增强模型的泛化性能,包含fgm, pgd, vat, 梯度惩罚
# 数据集:情感分类数据集
# 对抗训练:https://kexue.fm/archives/7234
# 虚拟对抗训练:https://kexue.fm/archives/7466
# 梯度惩罚:https://kexue.fm/archives/7234
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, get_pool_emb, seed_everything
from bert4torch.tokenizers import Tokenizer
import sys
maxlen = 256
batch_size = 16
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 传参方式
mode = sys.argv[1]
adversarial_train = {'name': mode}
print(f'Using {mode}'.center(60, '='))
# debug方式
# 具体参数设置可以到bert4torch.models/bert4torch.snippets里查看
# adversarial_train = {'name': 'fgm'} # fgm方式
# adversarial_train = {'name': 'pgd'} # pgd方式
# adversarial_train = {'name': 'gradient_penalty'} # 梯度惩罚
# adversarial_train = {'name': 'vat'} # 虚拟对抗,这里仅为使用有监督数据的示例
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy'], adversarial_train=adversarial_train)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
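# ------------------------------------------------------------------------------
# [Editor's note] Minimal sketch of the FGM step that adversarial_train={'name': 'fgm'}
# refers to: after the normal backward pass, perturb the embedding weights along the
# gradient direction, run a second forward/backward to accumulate the adversarial
# gradient, then restore the weights. Illustration of the technique only, not
# bert4torch's implementation; `emb_name` is an assumption about how the embedding
# parameters are named.
import torch

class FGMSketch:
    def __init__(self, model, epsilon=1.0, emb_name='word_embeddings'):
        self.model, self.epsilon, self.emb_name = model, epsilon, emb_name
        self.backup = {}
    def attack(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name and param.grad is not None:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)   # r_adv = eps * g/||g||
    def restore(self):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
# Usage sketch: loss.backward(); fgm.attack(); recompute the loss on the same batch and
# call backward() again to accumulate adversarial gradients; fgm.restore(); optimizer.step().
# ------------------------------------------------------------------------------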
#! -*- coding:utf-8 -*-
# 情感分类任务, 指数滑动平均
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.optimizers import extend_with_exponential_moving_average
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
ema_schedule = extend_with_exponential_moving_average(model, decay=0.99)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
scheduler=ema_schedule,
metrics=['accuracy']
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
ema_schedule.apply_ema_weights() # 使用滑动平均的ema权重
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
ema_schedule.restore_raw_weights() # 恢复原来模型的参数
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
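# ------------------------------------------------------------------------------
# [Editor's note] Sketch of what an exponential moving average of weights does,
# mirroring the apply_ema_weights()/restore_raw_weights() calls above: keep a
# shadow copy updated as shadow = decay*shadow + (1-decay)*param after every
# optimizer step, swap it in for evaluation, and swap the raw weights back for
# training. Illustration only, not bert4torch's
# extend_with_exponential_moving_average implementation.
import torch

class EMASketch:
    def __init__(self, model, decay=0.99):
        self.model, self.decay = model, decay
        self.shadow = {n: p.data.clone() for n, p in model.named_parameters() if p.requires_grad}
        self.backup = {}
    def update(self):  # call after each optimizer.step()
        for n, p in self.model.named_parameters():
            if n in self.shadow:
                self.shadow[n].mul_(self.decay).add_(p.data, alpha=1 - self.decay)
    def apply_shadow(self):  # use EMA weights for evaluation
        for n, p in self.model.named_parameters():
            if n in self.shadow:
                self.backup[n] = p.data.clone()
                p.data.copy_(self.shadow[n])
    def restore(self):  # back to the raw training weights
        for n, p in self.model.named_parameters():
            if n in self.backup:
                p.data.copy_(self.backup[n])
        self.backup = {}
# ------------------------------------------------------------------------------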
#! -*- coding:utf-8 -*-
# 情感分类任务, 加载bert权重
# Mixup策略,包含embedding,hidden, encoder的mixup
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.layers import MixUp
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'train' # train表示训练,infer表示推理
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, mixup_method='encoder', pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
self.mixup = MixUp(method=mixup_method)
def forward(self, token_ids):
hidden_states, pooling = self.mixup.encode(self.bert, [token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
y_pred = self.dense(output)
return y_pred
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
y_pred = self.dense(output)
return y_pred
model = Model().to(device)
class Loss(nn.Module):
def forward(self, y_pred, y_true):
return model.mixup(nn.CrossEntropyLoss(), y_pred, y_true)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=Loss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
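# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the mixup idea behind the MixUp layer used above:
# interpolate pairs of inputs with a Beta-distributed coefficient and combine the
# two cross-entropy losses with the same coefficient. Shown here on raw feature
# vectors for simplicity; the example above instead mixes BERT embeddings/hidden
# states/encoder outputs. Illustration only, not bert4torch's MixUp layer.
import numpy as np
import torch
import torch.nn.functional as F

def mixup_loss_sketch(model, x, y, alpha=0.4):
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(x.size(0), device=x.device)
    x_mix = lam * x + (1 - lam) * x[perm]              # interpolate inputs
    logits = model(x_mix)
    return lam * F.cross_entropy(logits, y) + (1 - lam) * F.cross_entropy(logits, y[perm])
# ------------------------------------------------------------------------------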
#! -*- coding:utf-8 -*-
# 以文本分类为例的半监督学习,虚拟对抗训练策略
# 监督数据部分只计算监督Loss, 有监督+无监督数据计算对抗训练的Loss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import random
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])
# 理论上应该收集任务领域内的无监督数据,这里用所有的监督数据充当无监督数据
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]
def collate_fn(batch):
# batch_token_ids包含两部分,第一部分是有监督数据,第二部分是无监督数据
batch_token_ids, batch_labels = [[], []], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids[0].append(token_ids)
batch_labels.append([label])
# 无监督部分
unsup_text = random.choice(unsup_dataset) # 随机挑一个无监督数据
token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
batch_token_ids[1].append(token_ids)
batch_token_ids = [j for i in batch_token_ids for j in i]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class MyLoss(nn.Module):
def forward(self, y_pred, y_true_sup):
y_pred_sup = y_pred[:y_true_sup.shape[0]] # 仅计算监督部分loss
return F.cross_entropy(y_pred_sup, y_true_sup)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=MyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
adversarial_train = {'name': 'vat', 'adv_alpha': 1} # 虚拟对抗
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for inputs, y_true in data:
inputs = [inputs[0][:y_true.size(0)]] # 仅计算有监督部分
y_pred = model.predict(inputs).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding: utf-8 -*-
from setuptools import setup, find_packages
setup(
name='bert4torch',
version='0.1.9',
description='an elegant bert4torch',
long_description='bert4torch: https://github.com/Tongjilibo/bert4torch',
license='MIT Licence',
url='https://github.com/Tongjilibo/bert4torch',
author='Tongjilibo',
install_requires=['torch>1.6'],
packages=find_packages()
)
\ No newline at end of file