Commit 66a1d0d0 authored by yangzhong

Initial commit of the bert4torch project

# Data generation step 1
python finetune_step1_dataprocess.py
# Data generation step 2
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/train.json \
--task_type "ext" \
--splits 1.0 0.0 0.0 \
--save_dir ./data/final_data/ \
--negative_ratio 3
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/dev.json \
--task_type "ext" \
--splits 0.0 1.0 0.0 \
--save_dir ./data/final_data/ \
--negative_ratio 0
python finetune_step2_doccano.py \
--doccano_file ./data/mid_data/test.json \
--task_type "ext" \
--splits 0.0 0.0 1.0 \
--save_dir ./data/final_data/ \
--negative_ratio 0
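# Each line of ./data/final_data/{train,dev,test}.txt produced above is a JSON record with
# "content", "prompt" and "result_list" fields; an illustrative (made-up) example:
# {"content": "谷爱凌获得金牌", "result_list": [{"text": "谷爱凌", "start": 0, "end": 3}], "prompt": "人名"}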
# Finetune training
python finetune_step3_train.py
# Data conversion step 1
import os
import re
import json
en2ch = {
'ORG':'机构',
'PER':'人名',
'LOC':'籍贯'
}
def preprocess(input_path, save_path, mode):
if not os.path.exists(save_path):
os.makedirs(save_path)
data_path = os.path.join(save_path, mode + ".json")
result = []
tmp = {}
tmp['id'] = 0
tmp['text'] = ''
tmp['relations'] = []
tmp['entities'] = []
    # ======= First, find each sentence and all of its entities and entity types =======
with open(input_path,'r',encoding='utf-8') as fp:
lines = fp.readlines()
texts = []
entities = []
words = []
entity_tmp = []
entities_tmp = []
entity_label = ''
for line in lines:
line = line.strip().split(" ")
if len(line) == 2:
word = line[0]
label = line[1]
words.append(word)
if "B-" in label:
entity_tmp.append(word)
entity_label = en2ch[label.split("-")[-1]]
elif "I-" in label:
entity_tmp.append(word)
if (label == 'O') and entity_tmp:
if ("".join(entity_tmp), entity_label) not in entities_tmp:
entities_tmp.append(("".join(entity_tmp), entity_label))
entity_tmp, entity_label = [], ''
else:
if entity_tmp and (("".join(entity_tmp), entity_label) not in entities_tmp):
entities_tmp.append(("".join(entity_tmp), entity_label))
entity_tmp, entity_label = [], ''
texts.append("".join(words))
entities.append(entities_tmp)
words = []
entities_tmp = []
# ==========================================
    # ======= Find the positions of the entities in each sentence =======
i = 0
for text,entity in zip(texts, entities):
if entity:
ltmp = []
            for ent, ent_type in entity:
                for span in re.finditer(re.escape(ent), text):
                    start = span.start()
                    end = span.end()
                    ltmp.append((ent_type, start, end, ent))
# print(ltmp)
ltmp = sorted(ltmp, key=lambda x:(x[1],x[2]))
for j in range(len(ltmp)):
# tmp['entities'].append(["".format(str(j)), ltmp[j][0], ltmp[j][1], ltmp[j][2], ltmp[j][3]])
tmp['entities'].append({"id":j, "start_offset":ltmp[j][1], "end_offset":ltmp[j][2], "label":ltmp[j][0]})
else:
tmp['entities'] = []
tmp['id'] = i
tmp['text'] = text
result.append(tmp)
tmp = {}
tmp['id'] = 0
tmp['text'] = ''
tmp['relations'] = []
tmp['entities'] = []
i += 1
with open(data_path, 'w', encoding='utf-8') as fp:
fp.write("\n".join([json.dumps(i, ensure_ascii=False) for i in result]))
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train", './data/mid_data', "train")
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.dev", './data/mid_data', "dev")
preprocess("F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.test", './data/mid_data', "test")
# Data generation step 2
import os
import time
import argparse
import json
from decimal import Decimal
import numpy as np
from bert4torch.snippets import seed_everything
from utils import convert_ext_examples, convert_cls_examples, logger
def do_convert():
seed_everything(args.seed)
tic_time = time.time()
if not os.path.exists(args.doccano_file):
raise ValueError("Please input the correct path of doccano file.")
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
if len(args.splits) != 0 and len(args.splits) != 3:
raise ValueError("Only []/ len(splits)==3 accepted for splits.")
def _check_sum(splits):
return Decimal(str(splits[0])) + Decimal(str(splits[1])) + Decimal(
str(splits[2])) == Decimal("1")
if len(args.splits) == 3 and not _check_sum(args.splits):
raise ValueError(
"Please set correct splits, sum of elements in splits should be equal to 1."
)
with open(args.doccano_file, "r", encoding="utf-8") as f:
raw_examples = f.readlines()
def _create_ext_examples(examples,
negative_ratio=0,
shuffle=False,
is_train=True):
entities, relations = convert_ext_examples(
examples, negative_ratio, is_train=is_train)
examples = entities + relations
if shuffle:
indexes = np.random.permutation(len(examples))
examples = [examples[i] for i in indexes]
return examples
def _create_cls_examples(examples, prompt_prefix, options, shuffle=False):
examples = convert_cls_examples(examples, prompt_prefix, options)
if shuffle:
indexes = np.random.permutation(len(examples))
examples = [examples[i] for i in indexes]
return examples
def _save_examples(save_dir, file_name, examples):
count = 0
save_path = os.path.join(save_dir, file_name)
if not examples:
logger.info("Skip saving %d examples to %s." % (0, save_path))
return
with open(save_path, "w", encoding="utf-8") as f:
for example in examples:
f.write(json.dumps(example, ensure_ascii=False) + "\n")
count += 1
logger.info("Save %d examples to %s." % (count, save_path))
if len(args.splits) == 0:
if args.task_type == "ext":
examples = _create_ext_examples(raw_examples, args.negative_ratio,
args.is_shuffle)
else:
examples = _create_cls_examples(raw_examples, args.prompt_prefix,
args.options, args.is_shuffle)
_save_examples(args.save_dir, "train.txt", examples)
else:
if args.is_shuffle:
indexes = np.random.permutation(len(raw_examples))
raw_examples = [raw_examples[i] for i in indexes]
i1, i2, _ = args.splits
p1 = int(len(raw_examples) * i1)
p2 = int(len(raw_examples) * (i1 + i2))
if args.task_type == "ext":
train_examples = _create_ext_examples(
raw_examples[:p1], args.negative_ratio, args.is_shuffle)
dev_examples = _create_ext_examples(
raw_examples[p1:p2], -1, is_train=False)
test_examples = _create_ext_examples(
raw_examples[p2:], -1, is_train=False)
else:
train_examples = _create_cls_examples(
raw_examples[:p1], args.prompt_prefix, args.options)
dev_examples = _create_cls_examples(
raw_examples[p1:p2], args.prompt_prefix, args.options)
test_examples = _create_cls_examples(
raw_examples[p2:], args.prompt_prefix, args.options)
_save_examples(args.save_dir, "train.txt", train_examples)
_save_examples(args.save_dir, "dev.txt", dev_examples)
_save_examples(args.save_dir, "test.txt", test_examples)
logger.info('Finished! It takes %.2f seconds' % (time.time() - tic_time))
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--doccano_file", default="./data/doccano.json",
type=str, help="The doccano file exported from doccano platform.")
parser.add_argument("-s", "--save_dir", default="./data",
type=str, help="The path of data that you wanna save.")
parser.add_argument("--negative_ratio", default=5, type=int,
help="Used only for the extraction task, the ratio of positive and negative samples, number of negtive samples = negative_ratio * number of positive samples")
parser.add_argument("--splits", default=[0.8, 0.1, 0.1], type=float, nargs="*",
help="The ratio of samples in datasets. [0.6, 0.2, 0.2] means 60%% samples used for training, 20%% for evaluation and 20%% for test.")
parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str,
help="Select task type, ext for the extraction task and cls for the classification task, defaults to ext.")
parser.add_argument("--options", default=["正向", "负向"], type=str, nargs="+",
help="Used only for the classification task, the options for classification")
parser.add_argument("--prompt_prefix", default="情感倾向", type=str,
help="Used only for the classification task, the prompt prefix for classification")
parser.add_argument("--is_shuffle", default=True, type=bool,
help="Whether to shuffle the labeled dataset, defaults to True.")
parser.add_argument("--seed", type=int, default=1000,
help="random seed for initialization")
args = parser.parse_args()
do_convert()
import torch
from torch.utils.data import DataLoader
from model import uie_model, tokenizer
from bert4torch.snippets import seed_everything, sequence_padding, Callback
from torch import nn
from torch.utils.data import Dataset
import numpy as np
import json
from utils import get_bool_ids_greater_than, get_span
from random import sample
batch_size = 16
learning_rate = 1e-5
train_path = 'E:/Github/bert4torch/examples/sequence_labeling/uie/data/final_data/train.txt'
dev_path = 'E:/Github/bert4torch/examples/sequence_labeling/uie/data/final_data/dev.txt'
save_dir = './'
max_seq_len = 256
num_epochs = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
uie_model.to(device)
class IEDataset(Dataset):
"""信息抽取
"""
def __init__(self, file_path, tokenizer, max_seq_len, fewshot=None) -> None:
super().__init__()
self.file_path = file_path
if fewshot is None:
self.dataset = list(self.reader(file_path))
else:
assert isinstance(fewshot, int)
self.dataset = sample(list(self.reader(file_path)), fewshot)
self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
def __len__(self):
return len(self.dataset)
def __getitem__(self, index):
return self.dataset[index]
@staticmethod
def reader(data_path, max_seq_len=512):
"""read json
"""
with open(data_path, 'r', encoding='utf-8') as f:
for line in f:
json_line = json.loads(line)
content = json_line['content']
prompt = json_line['prompt']
                # Model input looks like: [CLS] Prompt [SEP] Content [SEP]
                # It includes three special tokens.
if max_seq_len <= len(prompt) + 3:
raise ValueError("The value of max_seq_len is too small, please set a larger value")
max_content_len = max_seq_len - len(prompt) - 3
if len(content) <= max_content_len:
yield json_line
else:
result_list = json_line['result_list']
json_lines = []
accumulate = 0
while True:
cur_result_list = []
for result in result_list:
if result['start'] + 1 <= max_content_len < result['end']:
max_content_len = result['start']
break
cur_content = content[:max_content_len]
res_content = content[max_content_len:]
while True:
if len(result_list) == 0:
break
elif result_list[0]['end'] <= max_content_len:
if result_list[0]['end'] > 0:
cur_result = result_list.pop(0)
cur_result_list.append(cur_result)
else:
cur_result_list = [result for result in result_list]
break
else:
break
json_line = {'content': cur_content, 'result_list': cur_result_list, 'prompt': prompt}
json_lines.append(json_line)
for result in result_list:
if result['end'] <= 0:
break
result['start'] -= max_content_len
result['end'] -= max_content_len
accumulate += max_content_len
max_content_len = max_seq_len - len(prompt) - 3
if len(res_content) == 0:
break
elif len(res_content) < max_content_len:
json_line = {'content': res_content, 'result_list': result_list, 'prompt': prompt}
json_lines.append(json_line)
break
else:
content = res_content
for json_line in json_lines:
yield json_line
def collate_fn(batch):
"""example: {title, prompt, content, result_list}
"""
batch_token_ids, batch_token_type_ids, batch_start_ids, batch_end_ids = [], [], [], []
for example in batch:
token_ids, token_type_ids, offset_mapping = tokenizer.encode(example["prompt"], example["content"],
maxlen=max_seq_len, return_offsets='transformers')
bias = 0
for index in range(len(offset_mapping)):
if index == 0:
continue
mapping = offset_mapping[index]
if mapping[0] == 0 and mapping[1] == 0 and bias == 0:
bias = index
if mapping[0] == 0 and mapping[1] == 0:
continue
offset_mapping[index][0] += bias
offset_mapping[index][1] += bias
start_ids = [0 for _ in range(len(token_ids))]
end_ids = [0 for _ in range(len(token_ids))]
for item in example["result_list"]:
start = map_offset(item["start"] + bias, offset_mapping)
end = map_offset(item["end"] - 1 + bias, offset_mapping)
start_ids[start] = 1.0
end_ids[end] = 1.0
batch_token_ids.append(token_ids)
batch_token_type_ids.append(token_type_ids)
batch_start_ids.append(start_ids)
batch_end_ids.append(end_ids)
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_token_type_ids = torch.tensor(sequence_padding(batch_token_type_ids), dtype=torch.long, device=device)
batch_start_ids = torch.tensor(sequence_padding(batch_start_ids), dtype=torch.float, device=device)
batch_end_ids = torch.tensor(sequence_padding(batch_end_ids), dtype=torch.float, device=device)
return [batch_token_ids, batch_token_type_ids], [batch_start_ids, batch_end_ids]
def map_offset(ori_offset, offset_mapping):
"""map ori offset to token offset
"""
for index, span in enumerate(offset_mapping):
if span[0] <= ori_offset < span[1]:
return index
return -1
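# Illustrative example (hypothetical offset_mapping): with offset_mapping [(0, 0), (0, 2), (2, 3)],
# map_offset(2, offset_mapping) returns 2, because character offset 2 falls inside the token span (2, 3).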
# Data preparation
train_ds = IEDataset(train_path, tokenizer=tokenizer, max_seq_len=max_seq_len, fewshot=None)
dev_ds = IEDataset(dev_path, tokenizer=tokenizer, max_seq_len=max_seq_len)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(dev_ds, batch_size=batch_size, collate_fn=collate_fn)
class MyLoss(nn.Module):
def forward(self, y_pred, y_true):
start_prob, end_prob = y_pred
start_ids, end_ids = y_true
loss_start = torch.nn.functional.binary_cross_entropy(start_prob, start_ids)
loss_end = torch.nn.functional.binary_cross_entropy(end_prob, end_ids)
return loss_start + loss_end
uie_model.compile(
loss=MyLoss(),
optimizer=torch.optim.AdamW(lr=learning_rate, params=uie_model.parameters()),
)
class SpanEvaluator(Callback):
"""SpanEvaluator computes the precision, recall and F1-score for span detection.
"""
def __init__(self):
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
self.best_val_f1 = 0
def on_epoch_end(self, steps, epoch, logs=None):
f1, precision, recall = self.evaluate(valid_dataloader)
if f1 > self.best_val_f1:
self.best_val_f1 = f1
# model.save_weights('best_model.pt')
print(f'[val-entity level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
    def evaluate(self, dataloader):
        self.reset()
        for x_true, y_true in dataloader:
start_prob, end_prob = uie_model.predict(*x_true)
start_ids, end_ids = y_true
num_correct, num_infer, num_label = self.compute(start_prob, end_prob, start_ids, end_ids)
self.update(num_correct, num_infer, num_label)
precision, recall, f1 = self.accumulate()
return f1, precision, recall
def compute(self, start_probs, end_probs, gold_start_ids, gold_end_ids):
"""Computes the precision, recall and F1-score for span detection.
"""
start_probs = start_probs.cpu().numpy()
end_probs = end_probs.cpu().numpy()
gold_start_ids = gold_start_ids.cpu().numpy()
gold_end_ids = gold_end_ids.cpu().numpy()
pred_start_ids = get_bool_ids_greater_than(start_probs)
pred_end_ids = get_bool_ids_greater_than(end_probs)
gold_start_ids = get_bool_ids_greater_than(gold_start_ids.tolist())
gold_end_ids = get_bool_ids_greater_than(gold_end_ids.tolist())
num_correct_spans = 0
num_infer_spans = 0
num_label_spans = 0
for predict_start_ids, predict_end_ids, label_start_ids, label_end_ids in zip(
pred_start_ids, pred_end_ids, gold_start_ids, gold_end_ids):
[_correct, _infer, _label] = self.eval_span(predict_start_ids, predict_end_ids, label_start_ids, label_end_ids)
num_correct_spans += _correct
num_infer_spans += _infer
num_label_spans += _label
return num_correct_spans, num_infer_spans, num_label_spans
def update(self, num_correct_spans, num_infer_spans, num_label_spans):
"""
This function takes (num_infer_spans, num_label_spans, num_correct_spans) as input,
to accumulate and update the corresponding status of the SpanEvaluator object.
"""
self.num_infer_spans += num_infer_spans
self.num_label_spans += num_label_spans
self.num_correct_spans += num_correct_spans
def eval_span(self, predict_start_ids, predict_end_ids, label_start_ids,
label_end_ids):
"""
evaluate position extraction (start, end)
return num_correct, num_infer, num_label
input: [1, 2, 10] [4, 12] [2, 10] [4, 11]
output: (1, 2, 2)
"""
pred_set = get_span(predict_start_ids, predict_end_ids)
label_set = get_span(label_start_ids, label_end_ids)
num_correct = len(pred_set & label_set)
num_infer = len(pred_set)
num_label = len(label_set)
return (num_correct, num_infer, num_label)
def accumulate(self):
"""
This function returns the mean precision, recall and f1 score for all accumulated minibatches.
Returns:
tuple: Returns tuple (`precision, recall, f1 score`).
"""
precision = float(self.num_correct_spans / self.num_infer_spans) if self.num_infer_spans else 0.
recall = float(self.num_correct_spans / self.num_label_spans) if self.num_label_spans else 0.
f1_score = float(2 * precision * recall / (precision + recall)) if self.num_correct_spans else 0.
return precision, recall, f1_score
def reset(self):
"""
Reset function empties the evaluation memory for previous mini-batches.
"""
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
if __name__ == "__main__":
evaluator = SpanEvaluator()
print('zero_shot performance: ', evaluator.evaluate(valid_dataloader))
uie_model.fit(train_dataloader, epochs=num_epochs, steps_per_epoch=None, callbacks=[evaluator])
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything
from bert4torch.losses import FocalLoss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel, BERT
from tqdm import tqdm
config_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/uie/uie_base_pytorch/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class UIE(BERT):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
hidden_size = self.hidden_size
self.linear_start = nn.Linear(hidden_size, 1)
self.linear_end = nn.Linear(hidden_size, 1)
self.sigmoid = nn.Sigmoid()
        if kwargs.get('use_task_id'):
# Add task type embedding to BERT
task_type_embeddings = nn.Embedding(kwargs.get('task_type_vocab_size'), self.hidden_size)
self.embeddings.task_type_embeddings = task_type_embeddings
def hook(module, input, output):
return output+task_type_embeddings(torch.zeros(input[0].size(), dtype=torch.int64, device=input[0].device))
self.embeddings.word_embeddings.register_forward_hook(hook)
def forward(self, token_ids, token_type_ids):
outputs = super().forward([token_ids, token_type_ids])
sequence_output = outputs[0]
start_logits = self.linear_start(sequence_output)
start_logits = torch.squeeze(start_logits, -1)
start_prob = self.sigmoid(start_logits)
end_logits = self.linear_end(sequence_output)
end_logits = torch.squeeze(end_logits, -1)
end_prob = self.sigmoid(end_logits)
return start_prob, end_prob
@torch.no_grad()
def predict(self, token_ids, token_type_ids):
self.eval()
start_prob, end_prob = self.forward(token_ids, token_type_ids)
return start_prob, end_prob
uie_model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model=UIE, with_pool=True)
import numpy as np
import math
import torch
from bert4torch.snippets import sequence_padding
from utils import get_bool_ids_greater_than, get_span, get_id_and_prob, cut_chinese_sent, dbc2sbc
from pprint import pprint
import torch.nn.functional as F
class UIEPredictor(object):
def __init__(self, schema, device='cpu', position_prob=0.5, max_seq_len=512, batch_size=64, split_sentence=False):
self._device = device
self._position_prob = position_prob
self._max_seq_len = max_seq_len
        self._batch_size = batch_size
        self._split_sentence = split_sentence
self._schema_tree = None
self.set_schema(schema)
from model import uie_model, tokenizer
self._tokenizer = tokenizer
self.model = uie_model.to(self._device)
def set_schema(self, schema):
if isinstance(schema, dict) or isinstance(schema, str):
schema = [schema]
self._schema_tree = self._build_tree(schema)
def __call__(self, inputs):
texts = inputs
texts = [texts] if isinstance(texts, str) else texts
results = self._multi_stage_predict(texts)
return results
def _multi_stage_predict(self, datas):
"""构建schema tree和预测
"""
results = [{} for _ in range(len(datas))]
# input check to early return
if len(datas) < 1 or self._schema_tree is None:
return results
# copy to stay `self._schema_tree` unchanged
schema_list = self._schema_tree.children[:]
while len(schema_list) > 0:
node = schema_list.pop(0)
examples = []
input_map = {}
cnt = 0
idx = 0
if not node.prefix:
for data in datas:
examples.append({"text": data, "prompt": dbc2sbc(node.name)})
input_map[cnt] = [idx]
idx += 1
cnt += 1
else:
for pre, data in zip(node.prefix, datas):
if len(pre) == 0:
input_map[cnt] = []
else:
for p in pre:
examples.append({ "text": data, "prompt": dbc2sbc(p + node.name)})
input_map[cnt] = [i + idx for i in range(len(pre))]
idx += len(pre)
cnt += 1
if len(examples) == 0:
result_list = []
else:
result_list = self._single_stage_predict(examples)
if not node.parent_relations:
relations = [[] for i in range(len(datas))]
for k, v in input_map.items():
for idx in v:
if len(result_list[idx]) == 0:
continue
if node.name not in results[k].keys():
results[k][node.name] = result_list[idx]
else:
results[k][node.name].extend(result_list[idx])
if node.name in results[k].keys():
relations[k].extend(results[k][node.name])
else:
relations = node.parent_relations
for k, v in input_map.items():
for i in range(len(v)):
if len(result_list[v[i]]) == 0:
continue
if "relations" not in relations[k][i].keys():
relations[k][i]["relations"] = {
node.name: result_list[v[i]]
}
elif node.name not in relations[k][i]["relations"].keys(
):
relations[k][i]["relations"][
node.name] = result_list[v[i]]
else:
relations[k][i]["relations"][node.name].extend(
result_list[v[i]])
new_relations = [[] for i in range(len(datas))]
for i in range(len(relations)):
for j in range(len(relations[i])):
if "relations" in relations[i][j].keys(
) and node.name in relations[i][j]["relations"].keys():
for k in range(
len(relations[i][j]["relations"][
node.name])):
new_relations[i].append(relations[i][j][
"relations"][node.name][k])
relations = new_relations
prefix = [[] for _ in range(len(datas))]
for k, v in input_map.items():
for idx in v:
for i in range(len(result_list[idx])):
prefix[k].append(result_list[idx][i]["text"] + "的")
for child in node.children:
child.prefix = prefix
child.parent_relations = relations
schema_list.append(child)
return results
def _convert_ids_to_results(self, examples, sentence_ids, probs):
"""
Convert ids to raw text in a single stage.
"""
results = []
for example, sentence_id, prob in zip(examples, sentence_ids, probs):
if len(sentence_id) == 0:
results.append([])
continue
result_list = []
text = example["text"]
prompt = example["prompt"]
for i in range(len(sentence_id)):
start, end = sentence_id[i]
if start < 0 and end >= 0:
continue
if end < 0:
start += (len(prompt) + 1)
end += (len(prompt) + 1)
result = {"text": prompt[start:end],
"probability": prob[i]}
result_list.append(result)
else:
result = {
"text": text[start:end],
"start": start,
"end": end,
"probability": prob[i]
}
result_list.append(result)
results.append(result_list)
return results
def _auto_splitter(self, input_texts, max_text_len, split_sentence=False):
'''
Split the raw texts automatically for model inference.
Args:
input_texts (List[str]): input raw texts.
max_text_len (int): cutting length.
split_sentence (bool): If True, sentence-level split will be performed.
return:
short_input_texts (List[str]): the short input texts for model inference.
input_mapping (dict): mapping between raw text and short input texts.
'''
input_mapping = {}
short_input_texts = []
cnt_org = 0
cnt_short = 0
for text in input_texts:
if not split_sentence:
sens = [text]
else:
sens = cut_chinese_sent(text)
for sen in sens:
lens = len(sen)
if lens <= max_text_len:
short_input_texts.append(sen)
if cnt_org not in input_mapping.keys():
input_mapping[cnt_org] = [cnt_short]
else:
input_mapping[cnt_org].append(cnt_short)
cnt_short += 1
else:
temp_text_list = [sen[i:i + max_text_len] for i in range(0, lens, max_text_len)]
short_input_texts.extend(temp_text_list)
short_idx = cnt_short
cnt_short += math.ceil(lens / max_text_len)
temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)]
if cnt_org not in input_mapping.keys():
input_mapping[cnt_org] = temp_text_id
else:
input_mapping[cnt_org].extend(temp_text_id)
cnt_org += 1
return short_input_texts, input_mapping
def _single_stage_predict(self, inputs):
input_texts = []
prompts = []
for i in range(len(inputs)):
input_texts.append(inputs[i]["text"])
prompts.append(inputs[i]["prompt"])
        # the max predict length should exclude the length of the longest prompt and the summary tokens
        max_predict_len = self._max_seq_len - max(len(p) for p in prompts) - 3
short_input_texts, self.input_mapping = self._auto_splitter(input_texts, max_predict_len, split_sentence=self._split_sentence)
short_texts_prompts = []
for k, v in self.input_mapping.items():
short_texts_prompts.extend([prompts[k] for i in range(len(v))])
short_inputs = [{"text": short_input_texts[i], "prompt": short_texts_prompts[i]} for i in range(len(short_input_texts))]
token_ids, segment_ids, offset_maps = self._tokenizer.encode(short_texts_prompts, short_input_texts, maxlen=self._max_seq_len, return_offsets='transformers')
start_prob_concat, end_prob_concat = [], []
for batch_start in range(0, len(short_input_texts), self._batch_size):
batch_token_ids = token_ids[batch_start:batch_start+self._batch_size]
batch_segment_ids = segment_ids[batch_start:batch_start+self._batch_size]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=self._device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=self._device)
start_prob, end_prob = self.model.predict(batch_token_ids, batch_segment_ids)
start_prob_concat.append(start_prob.cpu().numpy())
end_prob_concat.append(end_prob.cpu().numpy())
start_prob_concat = np.concatenate(start_prob_concat)
end_prob_concat = np.concatenate(end_prob_concat)
start_ids_list = get_bool_ids_greater_than(start_prob_concat, limit=self._position_prob, return_prob=True)
end_ids_list = get_bool_ids_greater_than(end_prob_concat, limit=self._position_prob, return_prob=True)
sentence_ids = []
probs = []
for start_ids, end_ids, ids, offset_map in zip(start_ids_list, end_ids_list, token_ids, offset_maps):
for i in reversed(range(len(ids))):
if ids[i] != 0:
ids = ids[:i]
break
span_list = get_span(start_ids, end_ids, with_prob=True)
sentence_id, prob = get_id_and_prob(span_list, offset_map)
sentence_ids.append(sentence_id)
probs.append(prob)
results = self._convert_ids_to_results(short_inputs, sentence_ids, probs)
results = self._auto_joiner(results, short_input_texts, self.input_mapping)
return results
def _auto_joiner(self, short_results, short_inputs, input_mapping):
concat_results = []
is_cls_task = False
for short_result in short_results:
if short_result == []:
continue
elif 'start' not in short_result[0].keys(
) and 'end' not in short_result[0].keys():
is_cls_task = True
break
else:
break
for k, vs in input_mapping.items():
if is_cls_task:
cls_options = {}
single_results = []
for v in vs:
if len(short_results[v]) == 0:
continue
if short_results[v][0]['text'] not in cls_options.keys():
cls_options[short_results[v][0][
'text']] = [1, short_results[v][0]['probability']]
else:
cls_options[short_results[v][0]['text']][0] += 1
cls_options[short_results[v][0]['text']][
1] += short_results[v][0]['probability']
if len(cls_options) != 0:
cls_res, cls_info = max(cls_options.items(),
key=lambda x: x[1])
concat_results.append([{
'text': cls_res,
'probability': cls_info[1] / cls_info[0]
}])
else:
concat_results.append([])
else:
offset = 0
single_results = []
for v in vs:
if v == 0:
single_results = short_results[v]
offset += len(short_inputs[v])
else:
for i in range(len(short_results[v])):
if 'start' not in short_results[v][
i] or 'end' not in short_results[v][i]:
continue
short_results[v][i]['start'] += offset
short_results[v][i]['end'] += offset
offset += len(short_inputs[v])
single_results.extend(short_results[v])
concat_results.append(single_results)
return concat_results
def predict(self, input_data):
results = self._multi_stage_predict(input_data)
return results
@classmethod
def _build_tree(cls, schema, name='root'):
"""
Build the schema tree.
"""
schema_tree = SchemaTree(name)
for s in schema:
if isinstance(s, str):
schema_tree.add_child(SchemaTree(s))
elif isinstance(s, dict):
for k, v in s.items():
if isinstance(v, str):
child = [v]
elif isinstance(v, list):
child = v
else:
raise TypeError("Invalid schema, value for each key:value pairs should be list or string but {} received".format(type(v)))
schema_tree.add_child(cls._build_tree(child, name=k))
else:
raise TypeError("Invalid schema, element should be string or dict, but {} received".format(type(s)))
return schema_tree
class SchemaTree(object):
"""SchemaTree的实现
"""
def __init__(self, name='root', children=None):
self.name = name
self.children = []
self.prefix = None
self.parent_relations = None
if children is not None:
for child in children:
self.add_child(child)
def __repr__(self):
return self.name
def add_child(self, node):
        assert isinstance(node, SchemaTree), "The children of a node should be an instance of SchemaTree."
self.children.append(node)
if __name__ == '__main__':
    # Named entity recognition
schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction
ie = UIEPredictor(schema=schema)
pprint(ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!"))
schema = ['肿瘤的大小', '肿瘤的个数', '肝癌级别', '脉管内癌栓分级']
ie.set_schema(schema)
pprint(ie("(右肝肿瘤)肝细胞性肝癌(II-III级,梁索型和假腺管型),肿瘤包膜不完整,紧邻肝被膜,侵及周围肝组织,未见脉管内癌栓(MVI分级:M0级)及卫星子灶形成。(肿物1个,大小4.2×4.0×2.8cm)。"))
    # Relation extraction
schema = {'竞赛名称': ['主办方', '承办方', '已举办次数']}
ie.set_schema(schema) # Reset schema
pprint(ie('2022语言与智能技术竞赛由中国中文信息学会和中国计算机学会联合主办,百度公司、中国中文信息学会评测工作委员会和中国计算机学会自然语言处理专委会承办,已连续举办4届,成为全球最热门的中文NLP赛事之一。'))
    # Event extraction
schema = {'地震触发词': ['地震强度', '时间', '震中位置', '震源深度']}
ie.set_schema(schema) # Reset schema
ie('中国地震台网正式测定:5月16日06时08分在云南临沧市凤庆县(北纬24.34度,东经99.98度)发生3.5级地震,震源深度10千米。')
    # Opinion extraction (reviews)
schema = {'评价维度': ['观点词', '情感倾向[正向,负向]']}
ie.set_schema(schema) # Reset schema
pprint(ie("店面干净,很清静,服务员服务热情,性价比很高,发现收银台有排队"))
    # Sentiment classification
schema = '情感倾向[正向,负向]'
ie.set_schema(schema)
ie('这个产品用起来真的很流畅,我非常喜欢')
import contextlib
import functools
import json
import logging
import math
import random
import re
import shutil
import threading
import time
from functools import partial
import colorlog
import numpy as np
import torch
from colorama import Back, Fore
from tqdm import tqdm
loggers = {}
log_config = {
'DEBUG': {'level': 10, 'color': 'purple'},
'INFO': {'level': 20, 'color': 'green'},
'TRAIN': {'level': 21, 'color': 'cyan'},
'EVAL': {'level': 22, 'color': 'blue'},
'WARNING': {'level': 30, 'color': 'yellow'},
'ERROR': {'level': 40, 'color': 'red'},
'CRITICAL': {'level': 50, 'color': 'bold_red'}
}
def get_span(start_ids, end_ids, with_prob=False):
"""
Get span set from position start and end list.
Args:
start_ids (List[int]/List[tuple]): The start index list.
end_ids (List[int]/List[tuple]): The end index list.
        with_prob (bool): If True, each element of start_ids and end_ids is a tuple like (index, probability).
    Returns:
        set: The span set without overlapping; every id can only be used once.
    """
if with_prob:
start_ids = sorted(start_ids, key=lambda x: x[0])
end_ids = sorted(end_ids, key=lambda x: x[0])
else:
start_ids = sorted(start_ids)
end_ids = sorted(end_ids)
start_pointer = 0
end_pointer = 0
len_start = len(start_ids)
len_end = len(end_ids)
couple_dict = {}
while start_pointer < len_start and end_pointer < len_end:
if with_prob:
start_id = start_ids[start_pointer][0]
end_id = end_ids[end_pointer][0]
else:
start_id = start_ids[start_pointer]
end_id = end_ids[end_pointer]
if start_id == end_id:
couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
start_pointer += 1
end_pointer += 1
continue
if start_id < end_id:
couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
start_pointer += 1
continue
if start_id > end_id:
end_pointer += 1
continue
result = [(couple_dict[end], end) for end in couple_dict]
result = set(result)
return result
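# Illustrative example: get_span([1, 4], [3, 5]) returns {(1, 3), (4, 5)}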
def get_bool_ids_greater_than(probs, limit=0.5, return_prob=False):
"""
    Get the indices along the last dimension of the probability arrays whose values are greater than the given limit.
Args:
probs (List[List[float]]): The input probability arrays.
limit (float): The limitation for probability.
return_prob (bool): Whether to return the probability
Returns:
        List[List[int]]: The indices along the last dimension that meet the condition.
"""
probs = np.array(probs)
dim_len = len(probs.shape)
if dim_len > 1:
result = []
for p in probs:
result.append(get_bool_ids_greater_than(p, limit, return_prob))
return result
else:
result = []
for i, p in enumerate(probs):
if p > limit:
if return_prob:
result.append((i, p))
else:
result.append(i)
return result
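# Illustrative example: get_bool_ids_greater_than([[0.1, 0.7], [0.9, 0.2]], limit=0.5) returns [[1], [0]]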
class Logger(object):
'''
    Default logger in UIE
Args:
name(str) : Logger name, default is 'UIE'
'''
def __init__(self, name: str = None):
name = 'UIE' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(
self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(
self.__call__, conf['level'])
self.format = colorlog.ColoredFormatter(
'%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={key: conf['color']
for key, conf in log_config.items()})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float = 0.1):
'''
Continuously print a progress bar with rotating special effects.
Args:
msg(str): Message to be printed.
interval(float): Rotation interval. Default to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
t = threading.Thread(target=_printer)
t.start()
yield
end = True
logger = Logger()
BAR_FORMAT = f'{{desc}}: {Fore.GREEN}{{percentage:3.0f}}%{Fore.RESET} {Fore.BLUE}{{bar}}{Fore.RESET} {Fore.GREEN}{{n_fmt}}/{{total_fmt}} {Fore.RED}{{rate_fmt}}{{postfix}}{Fore.RESET} eta {Fore.CYAN}{{remaining}}{Fore.RESET}'
BAR_FORMAT_NO_TIME = f'{{desc}}: {Fore.GREEN}{{percentage:3.0f}}%{Fore.RESET} {Fore.BLUE}{{bar}}{Fore.RESET} {Fore.GREEN}{{n_fmt}}/{{total_fmt}}{Fore.RESET}'
BAR_TYPE = [
"░▝▗▖▘▚▞▛▙█",
"░▖▘▝▗▚▞█",
" ▖▘▝▗▚▞█",
"░▒█",
" >=",
" ▏▎▍▌▋▊▉█"
"░▏▎▍▌▋▊▉█"
]
tqdm = partial(tqdm, bar_format=BAR_FORMAT, ascii=BAR_TYPE[0], leave=False)
def get_id_and_prob(spans, offset_map):
prompt_length = 0
for i in range(1, len(offset_map)):
if offset_map[i] != [0, 0]:
prompt_length += 1
else:
break
for i in range(1, prompt_length + 1):
offset_map[i][0] -= (prompt_length + 1)
offset_map[i][1] -= (prompt_length + 1)
sentence_id = []
prob = []
for start, end in spans:
prob.append(start[1] * end[1])
sentence_id.append(
(offset_map[start[0]][0], offset_map[end[0]][1]))
return sentence_id, prob
def cut_chinese_sent(para):
"""
Cut the Chinese sentences more precisely, reference to
"https://blog.csdn.net/blmoistawinde/article/details/82379256".
"""
para = re.sub(r'([。!?\?])([^”’])', r'\1\n\2', para)
para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)
para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
para = para.rstrip()
return para.split("\n")
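# Illustrative example: cut_chinese_sent("今天天气好。明天下雨!") returns ['今天天气好。', '明天下雨!']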
def dbc2sbc(s):
rs = ""
for char in s:
code = ord(char)
if code == 0x3000:
code = 0x0020
else:
code -= 0xfee0
if not (0x0021 <= code and code <= 0x7e):
rs += char
continue
rs += chr(code)
return rs
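# Illustrative example: dbc2sbc("ABC 123") returns "ABC 123" (full-width characters converted to half-width)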
def convert_cls_examples(raw_examples, prompt_prefix, options):
examples = []
logger.info(f"Converting doccano data...")
with tqdm(total=len(raw_examples)) as pbar:
for line in raw_examples:
items = json.loads(line)
# Compatible with doccano >= 1.6.2
if "data" in items.keys():
text, labels = items["data"], items["label"]
else:
text, labels = items["text"], items["label"]
random.shuffle(options)
prompt = ""
sep = ","
for option in options:
prompt += option
prompt += sep
prompt = prompt_prefix + "[" + prompt.rstrip(sep) + "]"
result_list = []
example = {
"content": text,
"result_list": result_list,
"prompt": prompt
}
for label in labels:
start = prompt.rfind(label[0]) - len(prompt) - 1
end = start + len(label)
result = {"text": label, "start": start, "end": end}
example["result_list"].append(result)
examples.append(example)
return examples
def add_negative_example(examples, texts, prompts, label_set, negative_ratio):
negative_examples = []
positive_examples = []
with tqdm(total=len(prompts)) as pbar:
for i, prompt in enumerate(prompts):
negative_sample = []
redundants_list = list(set(label_set) ^ set(prompt))
redundants_list.sort()
num_positive = len(examples[i])
if num_positive != 0:
actual_ratio = math.ceil(len(redundants_list) / num_positive)
else:
# Set num_positive to 1 for text without positive example
num_positive, actual_ratio = 1, 0
if actual_ratio <= negative_ratio or negative_ratio == -1:
idxs = [k for k in range(len(redundants_list))]
else:
idxs = random.sample(
range(0, len(redundants_list)),
negative_ratio * num_positive)
for idx in idxs:
negative_result = {
"content": texts[i],
"result_list": [],
"prompt": redundants_list[idx]
}
negative_examples.append(negative_result)
positive_examples.extend(examples[i])
pbar.update(1)
return positive_examples, negative_examples
def add_full_negative_example(examples, texts, relation_prompts, predicate_set,
subject_goldens):
with tqdm(total=len(relation_prompts)) as pbar:
for i, relation_prompt in enumerate(relation_prompts):
negative_sample = []
for subject in subject_goldens[i]:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = subject + "的" + predicate
if prompt not in relation_prompt:
negative_result = {
"content": texts[i],
"result_list": [],
"prompt": prompt
}
negative_sample.append(negative_result)
examples[i].extend(negative_sample)
pbar.update(1)
return examples
def construct_relation_prompt_set(entity_name_set, predicate_set):
relation_prompt_set = set()
for entity_name in entity_name_set:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
relation_prompt = entity_name + "的" + predicate
relation_prompt_set.add(relation_prompt)
return sorted(list(relation_prompt_set))
def convert_ext_examples(raw_examples, negative_ratio, is_train=True):
texts = []
entity_examples = []
relation_examples = []
entity_prompts = []
relation_prompts = []
entity_label_set = []
entity_name_set = []
predicate_set = []
subject_goldens = []
logger.info(f"Converting doccano data...")
with tqdm(total=len(raw_examples)) as pbar:
for line in raw_examples:
items = json.loads(line)
entity_id = 0
if "data" in items.keys():
relation_mode = False
if isinstance(items["label"],
dict) and "entities" in items["label"].keys():
relation_mode = True
text = items["data"]
entities = []
if not relation_mode:
# Export file in JSONL format which doccano < 1.7.0
for item in items["label"]:
entity = {
"id": entity_id,
"start_offset": item[0],
"end_offset": item[1],
"label": item[2]
}
entities.append(entity)
entity_id += 1
else:
# Export file in JSONL format for relation labeling task which doccano < 1.7.0
for item in items["label"]["entities"]:
entity = {
"id": entity_id,
"start_offset": item["start_offset"],
"end_offset": item["end_offset"],
"label": item["label"]
}
entities.append(entity)
entity_id += 1
relations = []
else:
# Export file in JSONL format which doccano >= 1.7.0
if "label" in items.keys():
text = items["text"]
entities = []
for item in items["label"]:
entity = {
"id": entity_id,
"start_offset": item[0],
"end_offset": item[1],
"label": item[2]
}
entities.append(entity)
entity_id += 1
relations = []
else:
# Export file in JSONL (relation) format
text, relations, entities = items["text"], items[
"relations"], items["entities"]
texts.append(text)
entity_example = []
entity_prompt = []
entity_example_map = {}
entity_map = {} # id to entity name
for entity in entities:
entity_name = text[entity["start_offset"]:entity["end_offset"]]
entity_map[entity["id"]] = {
"name": entity_name,
"start": entity["start_offset"],
"end": entity["end_offset"]
}
entity_label = entity["label"]
result = {
"text": entity_name,
"start": entity["start_offset"],
"end": entity["end_offset"]
}
if entity_label not in entity_example_map.keys():
entity_example_map[entity_label] = {
"content": text,
"result_list": [result],
"prompt": entity_label
}
else:
entity_example_map[entity_label]["result_list"].append(
result)
if entity_label not in entity_label_set:
entity_label_set.append(entity_label)
if entity_name not in entity_name_set:
entity_name_set.append(entity_name)
entity_prompt.append(entity_label)
for v in entity_example_map.values():
entity_example.append(v)
entity_examples.append(entity_example)
entity_prompts.append(entity_prompt)
subject_golden = []
relation_example = []
relation_prompt = []
relation_example_map = {}
for relation in relations:
predicate = relation["type"]
subject_id = relation["from_id"]
object_id = relation["to_id"]
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = entity_map[subject_id]["name"] + "的" + predicate
if entity_map[subject_id]["name"] not in subject_golden:
subject_golden.append(entity_map[subject_id]["name"])
result = {
"text": entity_map[object_id]["name"],
"start": entity_map[object_id]["start"],
"end": entity_map[object_id]["end"]
}
if prompt not in relation_example_map.keys():
relation_example_map[prompt] = {
"content": text,
"result_list": [result],
"prompt": prompt
}
else:
relation_example_map[prompt]["result_list"].append(result)
if predicate not in predicate_set:
predicate_set.append(predicate)
relation_prompt.append(prompt)
for v in relation_example_map.values():
relation_example.append(v)
relation_examples.append(relation_example)
relation_prompts.append(relation_prompt)
subject_goldens.append(subject_golden)
pbar.update(1)
def concat_examples(positive_examples, negative_examples, negative_ratio):
examples = []
if math.ceil(len(negative_examples) /
len(positive_examples)) <= negative_ratio:
examples = positive_examples + negative_examples
else:
# Random sampling the negative examples to ensure overall negative ratio unchanged.
idxs = random.sample(
range(0, len(negative_examples)),
negative_ratio * len(positive_examples))
negative_examples_sampled = []
for idx in idxs:
negative_examples_sampled.append(negative_examples[idx])
examples = positive_examples + negative_examples_sampled
return examples
logger.info(f"Adding negative samples for first stage prompt...")
positive_examples, negative_examples = add_negative_example(
entity_examples, texts, entity_prompts, entity_label_set,
negative_ratio)
if len(positive_examples) == 0:
all_entity_examples = []
elif is_train:
all_entity_examples = concat_examples(positive_examples,
negative_examples, negative_ratio)
else:
all_entity_examples = positive_examples + negative_examples
all_relation_examples = []
if len(predicate_set) != 0:
if is_train:
logger.info(f"Adding negative samples for second stage prompt...")
relation_prompt_set = construct_relation_prompt_set(entity_name_set,
predicate_set)
positive_examples, negative_examples = add_negative_example(
relation_examples, texts, relation_prompts, relation_prompt_set,
negative_ratio)
all_relation_examples = concat_examples(
positive_examples, negative_examples, negative_ratio)
else:
logger.info(f"Adding negative samples for second stage prompt...")
relation_examples = add_full_negative_example(
relation_examples, texts, relation_prompts, predicate_set,
subject_goldens)
            all_relation_examples = [
                r
                for relation_example in relation_examples
                for r in relation_example
            ]
return all_entity_examples, all_relation_examples
def get_path_from_url(url,
root_dir,
check_exist=True,
decompress=True):
""" Download from given url to root_dir.
    If the file or directory specified by url already exists under
    root_dir, return its path directly; otherwise download it
    from url, decompress it if needed, and return the path.
Args:
url (str): download url
root_dir (str): root dir for downloading, it should be
WEIGHTS_HOME or DATASET_HOME
decompress (bool): decompress zip or tar file. Default is `True`
Returns:
str: a local path to save downloaded models & weights & datasets.
"""
import os.path
import os
import tarfile
import zipfile
def is_url(path):
"""
Whether path is URL.
Args:
path (string): URL string or not.
"""
return path.startswith('http://') or path.startswith('https://')
def _map_path(url, root_dir):
# parse path after download under root_dir
fname = os.path.split(url)[-1]
fpath = fname
return os.path.join(root_dir, fpath)
def _get_download(url, fullname):
import requests
# using requests.get method
fname = os.path.basename(fullname)
try:
req = requests.get(url, stream=True)
except Exception as e: # requests.exceptions.ConnectionError
logger.info("Downloading {} from {} failed with exception {}".format(
fname, url, str(e)))
return False
if req.status_code != 200:
raise RuntimeError("Downloading from {} failed with code "
"{}!".format(url, req.status_code))
        # To guard against interrupted downloads, download to
        # tmp_fullname first, then move tmp_fullname to fullname
        # once the download has finished
tmp_fullname = fullname + "_tmp"
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
with tqdm(total=(int(total_size) + 1023) // 1024, unit='KB') as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
pbar.update(1)
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
shutil.move(tmp_fullname, fullname)
return fullname
def _download(url, path):
"""
Download from url, save to path.
url (str): download url
path (str): download to given path
"""
if not os.path.exists(path):
os.makedirs(path)
fname = os.path.split(url)[-1]
fullname = os.path.join(path, fname)
retry_cnt = 0
logger.info("Downloading {} from {}".format(fname, url))
DOWNLOAD_RETRY_LIMIT = 3
while not os.path.exists(fullname):
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
retry_cnt += 1
else:
raise RuntimeError("Download from {} failed. "
"Retry limit reached".format(url))
if not _get_download(url, fullname):
time.sleep(1)
continue
return fullname
def _uncompress_file_zip(filepath):
with zipfile.ZipFile(filepath, 'r') as files:
file_list = files.namelist()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
# `strip(os.sep)` to remove `os.sep` in the tail of path
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _is_a_single_file(file_list):
if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
return True
return False
def _is_a_single_dir(file_list):
new_file_list = []
for file_path in file_list:
if '/' in file_path:
file_path = file_path.replace('/', os.sep)
elif '\\' in file_path:
file_path = file_path.replace('\\', os.sep)
new_file_list.append(file_path)
file_name = new_file_list[0].split(os.sep)[0]
for i in range(1, len(new_file_list)):
if file_name != new_file_list[i].split(os.sep)[0]:
return False
return True
def _uncompress_file_tar(filepath, mode="r:*"):
with tarfile.open(filepath, mode) as files:
file_list = files.getnames()
file_dir = os.path.dirname(filepath)
if _is_a_single_file(file_list):
rootpath = file_list[0]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
elif _is_a_single_dir(file_list):
rootpath = os.path.splitext(file_list[0].strip(os.sep))[0].split(
os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
files.extractall(file_dir)
else:
rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
uncompressed_path = os.path.join(file_dir, rootpath)
if not os.path.exists(uncompressed_path):
os.makedirs(uncompressed_path)
files.extractall(os.path.join(file_dir, rootpath))
return uncompressed_path
def _decompress(fname):
"""
Decompress for zip and tar file
"""
logger.info("Decompressing {}...".format(fname))
        # To guard against interrupted decompression, decompress to a
        # temporary directory first; if decompression succeeds, move the
        # decompressed files to fpath, delete the temporary directory,
        # and remove the downloaded compressed file.
if tarfile.is_tarfile(fname):
uncompressed_path = _uncompress_file_tar(fname)
elif zipfile.is_zipfile(fname):
uncompressed_path = _uncompress_file_zip(fname)
else:
raise TypeError("Unsupport compress file type {}".format(fname))
return uncompressed_path
    assert is_url(url), "downloading from {} is not a url".format(url)
fullpath = _map_path(url, root_dir)
if os.path.exists(fullpath) and check_exist:
logger.info("Found {}".format(fullpath))
else:
fullpath = _download(url, root_dir)
if decompress and (tarfile.is_tarfile(fullpath) or
zipfile.is_zipfile(fullpath)):
fullpath = _decompress(fullpath)
return fullpath
#! -*- coding:utf-8 -*-
# DDP (DistributedDataParallel) example
# Launch command: python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 task_distributed_data_parallel.py
import os
# Can also be passed in on the command line
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModelDDP
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)
torch.distributed.init_process_group(backend='nccl')
# Model settings
maxlen = 256
batch_size = 16
config_path = '/datasets/bert-base-chinese/config.json'
checkpoint_path = '/datasets/bert-base-chinese/pytorch_model.bin'
dict_path = '/datasets/bert-base-chinese/vocab.txt'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        """Load the data as (text, label) pairs; each line of the file should contain a text and an
        integer label separated by a tab, the same format used by the DP and AMP examples.
        """
        D = []
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text, label = l.strip().split('\t')
                D.append((text, int(label)))
        return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels.flatten()], None
# Load the dataset
train_dataset = MyDataset('/datasets/bert-base-chinese/china-people-daily-ner-corpus/example.train')
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT; here the loss is computed inside the model's forward
class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, token_ids, segment_ids, labels):
        _, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        loss = self.loss_fn(output, labels)
        return loss
model = Model().to(device)
# Wrap the model with DDP for multi-GPU training; master_rank is the local_rank that prints training progress
model = BaseModelDDP(model, master_rank=0, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
# Define the loss and optimizer; custom ones are supported
model.compile(
    loss=lambda x, _: x,  # pass through the loss already computed in forward
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=None)
#! -*- coding:utf-8 -*-
# Mixed precision (AMP) training example; in testing, GPU memory usage dropped by about 15%
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fix the random seed
seed_everything(42)
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# Load the dataset
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
# Load the dataset
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# Define the model structure on top of BERT
class Model(BaseModel):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids, segment_ids):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
use_amp=True, # True表示使用混合精度训练(automatic mixed precision)
metrics=['accuracy'],
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=None)
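# ------------------------------------------------------------------------------
# [Editor's note] Minimal sketch of what mixed-precision training roughly
# corresponds to in plain PyTorch (autocast + GradScaler). This is an
# illustration only, not bert4torch's internal implementation of use_amp; the
# tiny model and random data below are hypothetical placeholders.
import torch
import torch.nn as nn
import torch.optim as optim

def amp_training_sketch(steps=5):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    net = nn.Linear(16, 2).to(device)                   # placeholder model
    optimizer = optim.Adam(net.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
    for _ in range(steps):
        x = torch.randn(8, 16, device=device)           # placeholder batch
        y = torch.randint(0, 2, (8,), device=device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=(device == 'cuda')):
            loss = criterion(net(x), y)                  # forward runs in fp16 where safe
        scaler.scale(loss).backward()                    # scale loss to avoid fp16 underflow
        scaler.step(optimizer)                           # unscale grads, then optimizer step
        scaler.update()                                  # adjust the loss scale
# ------------------------------------------------------------------------------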
#! -*- coding:utf-8 -*-
# DP示例:把loss放在模型内部计算,可以部分缓解DataParallel的负载不均衡问题
import os
# 也可命令行传入
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel, BaseModelDP
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 固定seed
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels.flatten()], None
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# 定义bert上的模型结构,这里loss是放在模型内部计算的
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, token_ids, segment_ids, labels):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
loss = self.loss_fn(output, labels)
return loss
model = Model().to(device)
model = BaseModelDP(model) # 指定DP模型使用多gpu
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=lambda x, _: x.mean(), # 多个gpu计算的loss的均值
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=10)
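# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the idea in the comment above, using plain
# torch.nn.DataParallel (an assumption about what BaseModelDP wraps): when
# forward returns a per-replica scalar loss, DataParallel gathers one loss per
# GPU, so the training step only needs .mean() on device 0 instead of gathering
# full logits there. Placeholder model and data, not the bert4torch API.
import torch
import torch.nn as nn

class _LossInsideModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(16, 2)
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, x, y):
        return self.loss_fn(self.net(x), y)   # each replica returns its own scalar loss

def dp_sketch():
    model = _LossInsideModel()
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)        # replicas split the batch along dim 0
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    x = torch.randn(8, 16, device=device)
    y = torch.randint(0, 2, (8,), device=device)
    loss = model(x, y)
    loss = loss.mean()                        # with >1 GPU this averages the per-GPU losses
    loss.backward()
# ------------------------------------------------------------------------------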
#! -*- coding:utf-8 -*-
# DDP示例
# 启动命令:python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 task_distributed_data_parallel.py
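# [Editor's note] On newer PyTorch versions the same script is typically launched with
# `torchrun --nproc_per_node=2 task_distributed_data_parallel.py`; in that case the local
# rank is usually read from the LOCAL_RANK environment variable rather than --local_rank.
# This is an assumption about the runtime environment, not part of the original example.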
import os
# 也可命令行传入
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModelDDP
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset, seed_everything
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)
torch.distributed.init_process_group(backend='nccl')
# 模型设置
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
# 固定seed
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for text, label in batch:
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return [batch_token_ids, batch_segment_ids, batch_labels.flatten()], None
# 加载数据集
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, collate_fn=collate_fn)
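# [Editor's note] With a DistributedSampler, plain PyTorch training loops usually call
# train_sampler.set_epoch(epoch) at the start of every epoch so that shuffling differs
# across epochs; whether model.fit() does this internally is not shown here, so treat
# this as a reminder rather than a statement about the bert4torch API.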
# 定义bert上的模型结构,这里loss是放在模型内部计算的
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, token_ids, segment_ids, labels):
_, pooled_output = self.bert([token_ids, segment_ids])
output = self.dropout(pooled_output)
output = self.dense(output)
loss = self.loss_fn(output, labels)
return loss
model = Model().to(device)
# 指定DDP模型使用多gpu, master_rank为指定用于打印训练过程的local_rank
model = BaseModelDDP(model, master_rank=0, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=lambda x, _: x, # 直接把forward计算的loss传出来
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
if __name__ == '__main__':
model.fit(train_dataloader, epochs=20, steps_per_epoch=None)
#! -*- coding:utf-8 -*-
# 通过R-Drop增强模型的泛化性能
# 官方项目:https://github.com/dropreg/R-Drop
# 数据集:情感分类数据集
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything, text_segmentate, get_pool_emb
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import RDropLoss
from tqdm import tqdm
import torch.nn.functional as F
maxlen = 256
batch_size = 16
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
for _ in range(2):
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, dropout_rate=0.3, segment_vocab_size=0, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
model.compile(loss=RDropLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=['accuracy'])
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
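# ------------------------------------------------------------------------------
# [Editor's note] Minimal sketch of the R-Drop idea used above: every sample is
# fed twice (see the `for _ in range(2)` duplication in collate_fn), and the loss
# combines cross entropy on both passes with a symmetric KL term between the two
# dropout-perturbed predictions. Illustration of the technique only, not
# bert4torch's RDropLoss implementation; `alpha` is a hypothetical weight.
import torch
import torch.nn.functional as F

def rdrop_loss_sketch(logits, labels, alpha=4.0):
    """logits: [2*btz, num_labels], consecutive pairs are the two passes of the
    same sample; labels: [2*btz]."""
    ce = F.cross_entropy(logits, labels)
    p1, p2 = logits[0::2], logits[1::2]          # the two forward passes
    kl = F.kl_div(p1.log_softmax(dim=-1), p2.softmax(dim=-1), reduction='batchmean') + \
         F.kl_div(p2.log_softmax(dim=-1), p1.softmax(dim=-1), reduction='batchmean')
    return ce + alpha * kl / 2
# ------------------------------------------------------------------------------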
#! -*- coding:utf-8 -*-
# 通过TemporalEnsembling提升模型泛化
# 官方项目:https://github.com/s-laine/tempens
# pytorch第三方实现:https://github.com/ferretj/temporal-ensembling
# 数据集:情感分类数据集
# 本示例是把监督数据当成无监督数据使用
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, seed_everything, text_segmentate, get_pool_emb
from bert4torch.tokenizers import Tokenizer
from bert4torch.losses import TemporalEnsemblingLoss
maxlen = 256
batch_size = 16
epochs = 10
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集,训练数据集shuffle=False
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0, with_pool=True)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class MyLoss(TemporalEnsemblingLoss):
def forward(self, y_pred, y_true):
# 这里把监督数据当成无监督数据使用,真实场景中可以使用大量真正的无监督数据
y_pred_sup, y_pred_unsup, y_true_sup = y_pred, y_pred, y_true
return super().forward(y_pred_sup, y_pred_unsup, y_true_sup, model.epoch, model.bti)
loss = MyLoss(epochs=epochs, max_batch_num=None)
model.compile(loss=loss, optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=['accuracy'])
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=epochs, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
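# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the core temporal-ensembling update from Laine & Aila
# (the paper linked above): keep an exponential moving average Z of past
# predictions per sample, bias-correct it, and add an MSE consistency term with a
# ramp-up weight. Illustration only, not bert4torch's TemporalEnsemblingLoss;
# `alpha` and `w_t` are hypothetical hyperparameters.
import torch
import torch.nn.functional as F

def temporal_ensembling_step(z, Z, epoch, alpha=0.6, w_t=1.0):
    """z: current softmax predictions [n, num_labels]; Z: accumulated ensemble
    predictions for the same samples (same shape); epoch starts at 0.
    Returns (unsup_loss, new_Z)."""
    new_Z = alpha * Z + (1 - alpha) * z.detach()       # accumulate ensemble predictions
    z_tilde = new_Z / (1 - alpha ** (epoch + 1))       # bias correction, as in Adam
    unsup_loss = w_t * F.mse_loss(z, z_tilde)          # consistency with the ensemble target
    return unsup_loss, new_Z
# ------------------------------------------------------------------------------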
#! -*- coding:utf-8 -*-
# 以文本分类(情感分类)为例的半监督学习UDA策略,https://arxiv.org/abs/1904.12848
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.losses import UDALoss
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import random
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])
# 理论上应该收集任务领域内的无监督数据,这里用所有的监督数据充当无监督数据
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]
def collate_fn(batch):
def add_noise(token_ids, del_ratio=0.3):
'''这里用随机删除做简单示例,实际中可以使用增删改等多种noise方案
'''
n = len(token_ids)
keep_or_not = np.random.rand(n) > del_ratio
if sum(keep_or_not) == 0:
keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains
return list(np.array(token_ids)[keep_or_not])
# batch_token_ids包含三部分:第一部分是有监督数据,第二部分是领域内的无监督数据,第三部分是无监督数据经数据增强后的数据
batch_token_ids, batch_labels = [[], [], []], []
for text, label in batch:
token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids[0].append(token_ids)
batch_labels.append([label])
# 无监督部分
unsup_text = random.choice(unsup_dataset) # 随机挑一个无监督数据
token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
batch_token_ids[1].append(token_ids)
batch_token_ids[2].append(token_ids[:1] + add_noise(token_ids[1:-1]) + token_ids[-1:]) # 无监督数据增强
batch_token_ids = [j for i in batch_token_ids for j in i]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class Loss(UDALoss):
def forward(self, y_pred, y_true_sup):
loss, loss_sup, loss_unsup = super().forward(y_pred, y_true_sup, model.global_step, model.total_steps)
return {'loss': loss, 'loss_sup': loss_sup, 'loss_unsup': loss_unsup}
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=Loss(tsa_schedule='linear_schedule', start_p=0.8), # 这里可换用不同的策略, 不为None时候要给定model
optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['loss_sup', 'loss_unsup'] # Loss返回的key会自动计入metrics,即使这里不显式写出也会打印具体的Loss
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for token_ids, y_true in data:
y_pred = model.predict(token_ids[:y_true.size(0)]).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
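# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the UDA consistency term used above: the predictions
# on the original unsupervised samples act as a detached (optionally sharpened)
# target, and a KL term pulls the predictions on the augmented copies towards
# them; the total loss adds this to the supervised cross entropy (which UDA
# additionally masks with a TSA schedule). Illustration only, not bert4torch's
# UDALoss; `temperature` and `lam` are hypothetical hyperparameters.
import torch
import torch.nn.functional as F

def uda_loss_sketch(logits_sup, labels_sup, logits_unsup, logits_unsup_aug,
                    temperature=0.4, lam=1.0):
    loss_sup = F.cross_entropy(logits_sup, labels_sup)
    target = (logits_unsup / temperature).softmax(dim=-1).detach()   # sharpened, no grad
    loss_unsup = F.kl_div(logits_unsup_aug.log_softmax(dim=-1), target,
                          reduction='batchmean')
    return loss_sup + lam * loss_unsup
# ------------------------------------------------------------------------------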
#! -*- coding:utf-8 -*-
# 通过对抗训练/梯度惩罚增强模型的泛化性能,包含fgm, pgd, vat, 梯度惩罚
# 数据集:情感分类数据集
# 对抗训练:https://kexue.fm/archives/7234
# 虚拟对抗训练:https://kexue.fm/archives/7466
# 梯度惩罚:https://kexue.fm/archives/7234
from bert4torch.models import build_transformer_model, BaseModel
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.snippets import sequence_padding, Callback, ListDataset, text_segmentate, get_pool_emb, seed_everything
from bert4torch.tokenizers import Tokenizer
import sys
maxlen = 256
batch_size = 16
# BERT base
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
# 传参方式
mode = sys.argv[1]
adversarial_train = {'name': mode}
print(f'Using {mode}'.center(60, '='))
# debug方式
# 具体参数设置可以到bert4torch.models/bert4torch.snippets里查看
# adversarial_train = {'name': 'fgm'} # fgm方式
# adversarial_train = {'name': 'pgd'} # pgd方式
# adversarial_train = {'name': 'gradient_penalty'} # 梯度惩罚
# adversarial_train = {'name': 'vat'} # 虚拟对抗,这里仅为使用有监督数据的示例
model.compile(loss=nn.CrossEntropyLoss(), optimizer=optim.Adam(model.parameters(), lr=2e-5),
metrics=['accuracy'], adversarial_train=adversarial_train)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
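# ------------------------------------------------------------------------------
# [Editor's note] Minimal sketch of the FGM step that adversarial_train={'name': 'fgm'}
# refers to: after the normal backward pass, perturb the embedding weights along the
# gradient direction, run a second forward/backward to accumulate the adversarial
# gradient, then restore the weights. Illustration of the technique only, not
# bert4torch's implementation; `emb_name` is an assumption about how the embedding
# parameters are named.
import torch

class FGMSketch:
    def __init__(self, model, epsilon=1.0, emb_name='word_embeddings'):
        self.model, self.epsilon, self.emb_name = model, epsilon, emb_name
        self.backup = {}
    def attack(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name and param.grad is not None:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)   # r_adv = eps * g/||g||
    def restore(self):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
# Usage sketch: loss.backward(); fgm.attack(); recompute the loss on the same batch and
# call backward() again to accumulate adversarial gradients; fgm.restore(); optimizer.step().
# ------------------------------------------------------------------------------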
#! -*- coding:utf-8 -*-
# 情感分类任务, 指数滑动平均
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
from bert4torch.optimizers import extend_with_exponential_moving_average
import torch.nn as nn
import torch
import torch.optim as optim
import random, os, numpy as np
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
ema_schedule = extend_with_exponential_moving_average(model, decay=0.99)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=nn.CrossEntropyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
scheduler=ema_schedule,
metrics=['accuracy']
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
ema_schedule.apply_ema_weights() # 使用滑动平均的ema权重
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
ema_schedule.restore_raw_weights() # 恢复原来模型的参数
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
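# ------------------------------------------------------------------------------
# [Editor's note] Sketch of what an exponential moving average of weights does,
# mirroring the apply_ema_weights()/restore_raw_weights() calls above: keep a
# shadow copy updated as shadow = decay*shadow + (1-decay)*param after every
# optimizer step, swap it in for evaluation, and swap the raw weights back for
# training. Illustration only, not bert4torch's
# extend_with_exponential_moving_average implementation.
import torch

class EMASketch:
    def __init__(self, model, decay=0.99):
        self.model, self.decay = model, decay
        self.shadow = {n: p.data.clone() for n, p in model.named_parameters() if p.requires_grad}
        self.backup = {}
    def update(self):  # call after each optimizer.step()
        for n, p in self.model.named_parameters():
            if n in self.shadow:
                self.shadow[n].mul_(self.decay).add_(p.data, alpha=1 - self.decay)
    def apply_shadow(self):  # use EMA weights for evaluation
        for n, p in self.model.named_parameters():
            if n in self.shadow:
                self.backup[n] = p.data.clone()
                p.data.copy_(self.shadow[n])
    def restore(self):  # back to the raw training weights
        for n, p in self.model.named_parameters():
            if n in self.backup:
                p.data.copy_(self.backup[n])
        self.backup = {}
# ------------------------------------------------------------------------------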
#! -*- coding:utf-8 -*-
# 情感分类任务, 加载bert权重
# Mixup策略,包含embedding,hidden, encoder的mixup
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.layers import MixUp
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
choice = 'train' # train表示训练,infer表示推理
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
def collate_fn(batch):
batch_token_ids, batch_labels = [], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids.append(token_ids)
batch_labels.append([label])
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data']), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data']), batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data']), batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, mixup_method='encoder', pool_method='cls') -> None:
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
self.mixup = MixUp(method=mixup_method)
def forward(self, token_ids):
hidden_states, pooling = self.mixup.encode(self.bert, [token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
y_pred = self.dense(output)
return y_pred
def predict(self, token_ids):
self.eval()
with torch.no_grad():
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
y_pred = self.dense(output)
return y_pred
model = Model().to(device)
class Loss(nn.Module):
def forward(self, y_pred, y_true):
return model.mixup(nn.CrossEntropyLoss(), y_pred, y_true)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=Loss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
if choice == 'train':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
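# ------------------------------------------------------------------------------
# [Editor's note] Sketch of the mixup idea behind the MixUp layer used above:
# interpolate pairs of inputs with a Beta-distributed coefficient and combine the
# two cross-entropy losses with the same coefficient. Shown here on raw feature
# vectors for simplicity; the example above instead mixes BERT embeddings/hidden
# states/encoder outputs. Illustration only, not bert4torch's MixUp layer.
import numpy as np
import torch
import torch.nn.functional as F

def mixup_loss_sketch(model, x, y, alpha=0.4):
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(x.size(0), device=x.device)
    x_mix = lam * x + (1 - lam) * x[perm]              # interpolate inputs
    logits = model(x_mix)
    return lam * F.cross_entropy(logits, y) + (1 - lam) * F.cross_entropy(logits, y[perm])
# ------------------------------------------------------------------------------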
#! -*- coding:utf-8 -*-
# 以文本分类为例的半监督学习,虚拟对抗训练策略
# 监督数据部分只计算监督Loss, 有监督+无监督数据计算对抗训练的Loss
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import random
maxlen = 256
batch_size = 16
config_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed_everything(42)
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# 加载数据集
class MyDataset(ListDataset):
@staticmethod
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
train_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data'])
valid_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.valid.data'])
test_dataset = MyDataset(['F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.test.data'])
# 理论上应该收集任务领域内的无监督数据,这里用所有的监督数据充当无监督数据
unsup_dataset = [sen for sen, _ in (train_dataset.data + valid_dataset.data + test_dataset.data)]
def collate_fn(batch):
# batch_token_ids包含两部分,第一部分是有监督数据,第二部分是无监督数据
batch_token_ids, batch_labels = [[], []], []
for text, label in batch:
token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
batch_token_ids[0].append(token_ids)
batch_labels.append([label])
# 无监督部分
unsup_text = random.choice(unsup_dataset) # 随机挑一个无监督数据
token_ids, _ = tokenizer.encode(unsup_text, maxlen=maxlen)
batch_token_ids[1].append(token_ids)
batch_token_ids = [j for i in batch_token_ids for j in i]
batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
return batch_token_ids, batch_labels.flatten()
# 加载数据集
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
# 定义bert上的模型结构
class Model(BaseModel):
def __init__(self, pool_method='cls'):
super().__init__()
self.pool_method = pool_method
self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool=True, segment_vocab_size=0)
self.dropout = nn.Dropout(0.1)
self.dense = nn.Linear(self.bert.configs['hidden_size'], 2)
def forward(self, token_ids):
hidden_states, pooling = self.bert([token_ids])
pooled_output = get_pool_emb(hidden_states, pooling, token_ids.gt(0).long(), self.pool_method)
output = self.dropout(pooled_output)
output = self.dense(output)
return output
model = Model().to(device)
class MyLoss(nn.Module):
def forward(self, y_pred, y_true_sup):
y_pred_sup = y_pred[:y_true_sup.shape[0]] # 仅计算监督部分loss
return F.cross_entropy(y_pred_sup, y_true_sup)
# 定义使用的loss和optimizer,这里支持自定义
model.compile(
loss=MyLoss(),
optimizer=optim.Adam(model.parameters(), lr=2e-5),
adversarial_train = {'name': 'vat', 'adv_alpha': 1} # 虚拟对抗
)
class Evaluator(Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, global_step, epoch, logs=None):
val_acc = self.evaluate(valid_dataloader)
test_acc = self.evaluate(test_dataloader)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
# model.save_weights('best_model.pt')
print(f'val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')
# 定义评价函数
def evaluate(self, data):
total, right = 0., 0.
for inputs, y_true in data:
inputs = [inputs[0][:y_true.size(0)]] # 仅计算有监督部分
y_pred = model.predict(inputs).argmax(axis=1)
total += len(y_true)
right += (y_true == y_pred).sum().item()
return right / total
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
else:
model.load_weights('best_model.pt')
#! -*- coding: utf-8 -*-
from setuptools import setup, find_packages
setup(
name='bert4torch',
version='0.1.9',
description='an elegant bert4torch',
long_description='bert4torch: https://github.com/Tongjilibo/bert4torch',
license='MIT Licence',
url='https://github.com/Tongjilibo/bert4torch',
author='Tongjilibo',
install_requires=['torch>1.6'],
packages=find_packages()
)
\ No newline at end of file