# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time

import numpy as np
import pandas as pd
from data import gen_id2corpus
from milvus_util import RecallByMilvus
from paddle_serving_server.pipeline import PipelineClient


def search_in_milvus(text_embedding, query_text):
    collection_name = "faq_system"
    partition_tag = "partition_1"
    client = RecallByMilvus()
    start_time = time.time()
    status, results = client.search(
        collection_name=collection_name, vectors=text_embedding, partition_tag=partition_tag
    )
    end_time = time.time()
    print("Search Milvus time cost is {} seconds".format(end_time - start_time))
    corpus_file = "data/qa_pair.csv"
    id2corpus = gen_id2corpus(corpus_file)
    list_data = []
    for line in results:
        for item in line:
            idx = item.id
            distance = item.distance
            text = id2corpus[idx]
            print(text, distance)
            list_data.append([query_text, text, distance])
    df = pd.DataFrame(list_data, columns=["query_text", "text", "distance"])
    df = df.sort_values(by="distance", ascending=True)
    df.to_csv("data/recall_predict.csv", columns=["text", "distance"], sep="\t", header=None, index=False)


if __name__ == "__main__":
    client = PipelineClient()
    client.connect(["127.0.0.1:8080"])
    list_data = ["嘉定区南翔镇实行双门长制“门长”要求落实好哪些工作?"]
    feed = {}
    for i, item in enumerate(list_data):
        feed[str(i)] = item
    start_time = time.time()
    ret = client.predict(feed_dict=feed)
    end_time = time.time()
    print("Feature extraction time cost: {} seconds".format(end_time - start_time))
    result = np.array(eval(ret.value[0]))
    search_in_milvus(result, list_data[0])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python -u evaluate.py \
--similar_text_pair "data/test_pair.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 10
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python export_model.py --params_path checkpoints/model_150/model_state.pdparams --output_path=./output
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "serving_server" \
--client_path "serving_client" \
--fetch_alias_names "output_embedding"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python feature_extract.py \
--model_dir=./output \
--corpus_file "data/corpus.csv"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# gpu
python -u -m paddle.distributed.launch --gpus "4" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--params_path "checkpoints/model_150/model_state.pdparams" \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
--output_emb_size 256 \
--max_seq_length 64 \
--recall_num 10 \
--similar_text_pair "data/test_pair.csv" \
--corpus_file "data/corpus.csv"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python -u -m paddle.distributed.launch --gpus '4' \
train.py \
--device gpu \
--save_dir ./checkpoints/ \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--save_steps 50 \
--eval_steps 50 \
--max_seq_length 64 \
--dropout 0.2 \
--output_emb_size 256 \
--dup_rate 0.3 \
--train_set_file "./data/train.csv"
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import time
from functools import partial
import numpy as np
import paddle
from data import convert_example, create_dataloader, read_simcse_text, word_repetition
from model import SimCSE
from scipy import stats
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer, LinearDecayWithWarmup
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=0, type=int, help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--epochs", default=1, type=int, help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proportion over the training process.")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--save_steps', type=int, default=10000, help="Step interval for saving checkpoint.")
parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override ecpochs.")
parser.add_argument('--eval_steps', type=int, default=10000, help="Step interval for evaluation.")
parser.add_argument("--train_set_file", type=str, required=True, help="The full path of train_set_file.")
parser.add_argument("--margin", default=0.0, type=float, help="Margin between pos_sample and neg_samples.")
parser.add_argument("--scale", default=20, type=int, help="Scale for pair-wise margin_rank_loss.")
parser.add_argument("--dropout", default=0.1, type=float, help="Dropout for pretrained model encoder.")
parser.add_argument("--dup_rate", default=0.32, type=float, help="duplicate rate for word repetition.")
parser.add_argument("--infer_with_fc_pooler", action='store_true', help="Whether use fc layer after cls embedding or not for when infer.")
args = parser.parse_args()
def set_seed(seed):
"""sets random seed"""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
def do_evaluate(model, tokenizer, data_loader, with_pooler=False):
model.eval()
total_num = 0
spearman_corr = 0.0
sims = []
labels = []
for batch in data_loader:
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids, label = batch
total_num += len(label)
query_cls_embedding = model.get_pooled_embedding(
query_input_ids, query_token_type_ids, with_pooler=with_pooler)
title_cls_embedding = model.get_pooled_embedding(title_input_ids, title_token_type_ids, with_pooler=with_pooler)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
sims.append(cosine_sim.numpy())
labels.append(label.numpy())
sims = np.concatenate(sims, axis=0)
labels = np.concatenate(labels, axis=0)
spearman_corr = stats.spearmanr(labels, sims).correlation
model.train()
return spearman_corr, total_num
def do_train():
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
train_ds = load_dataset(
read_simcse_text, data_path=args.train_set_file, lazy=False)
model_name_or_path = 'rocketqa-zh-dureader-query-encoder'
pretrained_model = AutoModel.from_pretrained(model_name_or_path, hidden_dropout_prob=args.dropout, attention_probs_dropout_prob=args.dropout)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
trans_func = partial(
convert_example,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length)
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # query_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # query_segment
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # title_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # title_segment
)
):
return [data for data in fn(samples)]
train_data_loader = create_dataloader(
train_ds,
mode='train',
batch_size=args.batch_size,
batchify_fn=batchify_fn,
trans_fn=trans_func)
model = SimCSE(
pretrained_model,
margin=args.margin,
scale=args.scale,
output_emb_size=args.output_emb_size)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
print("warmup from:{}".format(args.init_from_ckpt))
model = paddle.DataParallel(model)
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params)
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
if args.dup_rate > 0.0:
query_input_ids, query_token_type_ids = word_repetition(query_input_ids, query_token_type_ids, args.dup_rate)
title_input_ids, title_token_type_ids = word_repetition(title_input_ids, title_token_type_ids, args.dup_rate)
loss = model(
query_input_ids=query_input_ids,
title_input_ids=title_input_ids,
query_token_type_ids=query_token_type_ids,
title_token_type_ids=title_token_type_ids)
global_step += 1
if global_step % 10 == 0 and rank == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss,
10 / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
if global_step % args.save_steps == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, 'model_state.pdparams')
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
if args.max_steps > 0 and global_step >= args.max_steps:
return
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, 'model_state.pdparams')
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
if __name__ == "__main__":
do_train()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from milvus_util import VecToMilvus
from tqdm import tqdm


def vector_insert(file_path):
    embeddings = np.load(file_path)
    print(embeddings.shape)
    embedding_ids = [i for i in range(embeddings.shape[0])]
    print(len(embedding_ids))
    client = VecToMilvus()
    collection_name = "faq_system"
    partition_tag = "partition_1"
    data_size = len(embedding_ids)
    batch_size = 100000
    for i in tqdm(range(0, data_size, batch_size)):
        cur_end = i + batch_size
        if cur_end > data_size:
            cur_end = data_size
        batch_emb = embeddings[np.arange(i, cur_end)]
        status, ids = client.insert(
            collection_name=collection_name,
            vectors=batch_emb.tolist(),
            ids=embedding_ids[i : i + batch_size],
            partition_tag=partition_tag,
        )


if __name__ == "__main__":
    file_path = "corpus_embedding.npy"
    vector_insert(file_path)
# Unsupervised Retrieval-Based Question Answering System

**Table of Contents**

- [Unsupervised Retrieval-Based Question Answering System](#unsupervised-retrieval-based-question-answering-system)
  - [Introduction](#introduction)
    - [Project Advantages](#project-advantages)
  - [Solution Overview](#solution-overview)
    - [Workflow](#workflow)
    - [Technical Approach](#technical-approach)
    - [Code Structure](#code-structure)
  - [Quick Start](#quick-start)
    - [Environment and Installation](#environment-and-installation)
    - [Data Description](#data-description)
    - [Quick Start with the Unsupervised Retrieval-Based QA System](#quick-start-with-the-unsupervised-retrieval-based-qa-system)
  - [Visualizing the Unsupervised Retrieval-Based QA System](#visualizing-the-unsupervised-retrieval-based-qa-system)
    - [Offline QA-Pair Corpus Construction](#offline-qa-pair-corpus-construction)
    - [Building a QA System with Pipelines](#building-a-qa-system-with-pipelines)
  - [Custom Models](#custom-models)
    - [Data Preparation](#data-preparation)
    - [Model Fine-Tuning](#model-fine-tuning)
      - [Answer Extraction](#answer-extraction)
      - [Question Generation](#question-generation)
      - [Filtering Model](#filtering-model)
      - [Semantic Indexing and Recall Models](#semantic-indexing-and-recall-models)
      - [Ranking Model](#ranking-model)
  - [References](#references)
## Introduction

One of the most critical challenges for question answering (QA) systems is the scarcity of labeled data: collecting question-answer pairs or FAQ pairs for a target domain is expensive and takes a great deal of human effort and time. This constraint makes retrieval-based QA systems hard to deploy in practice. One way to address it is to automatically generate QA pairs from the question context or from large amounts of unstructured text.

Against this background, the unsupervised retrieval-based QA system (i.e., intelligent retrieval-based QA built on automatically generated QA pairs) combines PaddleNLP [question generation](../../../examples/question_generation/README.md), [UIE](../../../model_zoo/uie/README.md) and [retrieval-based QA](../supervised_qa/faq_finance/README.md). It automatically generates QA pairs from unstructured text treated as context, and the generated QA-pair corpus can then be used to build a retrieval-based QA system in an unsupervised way.

If you already have an FAQ corpus, please refer to [supervised_qa](../supervised_qa).

### Project Advantages

Specifically, this project offers the following advantages:

+ Low cost
  + QA corpora can be synthesized quickly and at scale through automatic generation, greatly reducing labor cost
  + Good controllability: the synthesized corpus is decoupled from semantic retrieval QA, so generated QA pairs can be manually reviewed and removed, and manually labeled QA pairs can be added
+ Low barrier to entry
  + A step-by-step guide to building an unsupervised retrieval-based QA system
  + No labeled similar Query-Query pairs are needed to build the QA system
+ Strong results
  + Automatic QA-pair generation increases corpus coverage and alleviates the poor coverage of mid- and long-tail questions
  + Industry-leading retrieval pretrained model: the RocketQA dual encoder
  + A leading solution for unlabeled-data scenarios: retrieval pretrained model + enhanced unsupervised semantic index fine-tuning
+ End to end
  + Provides a complete end-to-end intelligent QA stack, including QA corpus generation, index construction, model serving deployment and a Web UI
  + Supports uploading multi-source data (Txt, Word, PDF, Image), as well as offline and online QA corpus generation and ANN database updates
## Solution Overview

<!-- ### Evaluation Metrics
**QA-pair generation**: QA-pair generation is evaluated with soft Recall@K.
**Semantic indexing**: Semantic indexing is evaluated with Recall@K, i.e., the overlap between the predicted top-K results (the first K entries of the final score-sorted recall list) and the true top-K relevant results in the corpus, which measures the recall of the retrieval system. -->

### Workflow

The workflow of this project is shown below. For a given piece of unstructured text, we first obtain a large number of corpus-related QA pairs through the answer extraction, question generation and round-trip filtering modules. These generated QA pairs can be adjusted by manually reviewing and removing them, or by adding manually labeled pairs. Developers can then build a vector index with the semantic indexing module. Once the index is constructed, queries are answered through the recall and ranking modules, which return the final results. A minimal code sketch of this flow follows the figure.
<div align="center">
<img width="700" alt="image" src="https://user-images.githubusercontent.com/20476674/211868709-2ac0932d-c48b-4f87-b1cf-1f2665e5a64e.png">
</div>
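To make the workflow concrete, here is a minimal sketch of the offline QA-pair generation loop. The helper names (`extract_answers`, `generate_questions`, `roundtrip_keep`) are hypothetical placeholders for the UIE-based answer extractor, the UNIMO-Text question generator and the round-trip filter described below; they are not the actual functions of this repository.

```python
# Minimal sketch of the QA-pair generation flow (hypothetical helpers, not the repo's API).
def build_qa_pairs(contexts, extract_answers, generate_questions, roundtrip_keep):
    """For each context: extract candidate answers, over-generate questions,
    then keep only the pairs that survive round-trip filtering."""
    qa_pairs = []
    for context in contexts:
        for answer in extract_answers(context):  # UIE-based answer extraction
            for question in generate_questions(context, answer):  # UNIMO-Text question generation
                if roundtrip_keep(context, question, answer):  # round-trip consistency filter
                    qa_pairs.append({"context": context, "question": question, "answer": answer})
    return qa_pairs
```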
### Technical Approach

Because many modules are involved, this project is assembled and organized with PaddleNLP Pipelines. PaddleNLP Pipelines is an end-to-end NLP pipeline framework that builds a complete unsupervised QA system from pluggable, production-oriented components. Concretely, our technical approach covers the following parts:

**Answer extraction**: We train an answer extraction model based on UIE. The model takes "答案" (answer) as its prompt and mines candidate answer spans from the text. We also provide the trained weights as `uie-base-answer-extractor`.

**Question generation**: Based on the Chinese pretrained language model UNIMO-Text, a template strategy and a large-scale multi-domain question-generation dataset, we train a general-purpose question-generation pretrained model, `unimo-text-1.0-question-generation`.

**Round-trip filtering**: We adopt an over-generation strategy to produce a large number of candidate answers and questions, then apply round-trip filtering to prune the over-generated QA pairs and obtain the final set. The round-trip filtering module requires training a conditional extractive QA model<sup>3</sup>.

**Semantic indexing**: For the given QA-pair corpus, we encode QA pairs into semantic vectors with RocketQA (`rocketqa-zh-base-query-encoder`) and build the index via Elasticsearch's ANN service.

**Recall and ranking**: Given a user query, we use the RocketQA query encoder for recall and the RocketQA cross encoder for ranking, retrieve the target QA pairs and return them as the query result.

**Pipelines**: Since the project involves many modules, we use PaddleNLP Pipelines to assemble them. In short, our Pipelines setup consists of two concrete pipelines and three services: the two pipelines are qa_generation_pipeline and dense_faq_pipeline, and the three services are an Elasticsearch-based ANN index service, a REST API model backend service and a Streamlit-based Web UI frontend service.
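As a rough illustration of the recall-then-rank step, the sketch below retrieves candidates with a bi-encoder and reranks the top hits with a cross encoder. `encode_query`, `corpus_embeddings`, `corpus_texts` and `cross_encoder_score` are hypothetical stand-ins for the RocketQA query encoder, the prebuilt corpus vectors, the QA-pair texts and the RocketQA cross encoder; in the real system these steps are delegated to the Pipelines components and the ANN index service.

```python
import numpy as np


def recall_then_rank(query, encode_query, corpus_embeddings, corpus_texts,
                     cross_encoder_score, top_k=50, top_n=5):
    """Recall top_k candidates by inner product, then rerank them with a cross encoder."""
    q = encode_query(query)  # shape (dim,); inner product equals cosine similarity if vectors are L2-normalized
    sims = corpus_embeddings @ q  # coarse scores against the whole corpus
    candidate_ids = np.argsort(-sims)[:top_k]  # recall stage
    reranked = sorted(
        ((cross_encoder_score(query, corpus_texts[i]), corpus_texts[i]) for i in candidate_ids),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return reranked[:top_n]  # ranking stage
```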
## Quick Start

### Environment and Installation

Building the QA system with Pipelines requires the paddle-pipelines dependency. Install it with pip:
```bash
# one-line install with pip
pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
```
Alternatively, go to the pipelines directory and install from source:
```bash
# install from source
cd PaddleNLP/pipelines/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install
```
### Data Description

We use the provided plain-text file [source_file.txt](https://paddlenlp.bj.bcebos.com/applications/unsupervised_qa/source_file.txt) as an example. The system treats each line as a context, generates multiple QA pairs from it and builds the index on top of them. The file can be downloaded directly into the `data` directory, or you can use your own file.

### Quick Start with the Unsupervised Retrieval-Based QA System

Run the following command to quickly try out the unsupervised intelligent retrieval QA system. It automatically builds a QA-pair corpus from the provided plain-text file and constructs the index from the generated corpus.

We recommend running this example on a GPU, which is considerably faster:
```bash
# run the example on a GPU
# pick one idle GPU card; here we assume card 0 is free
export CUDA_VISIBLE_DEVICES=0
python run_pipelines_example.py --device gpu --source_file data/source_file.txt --doc_dir data/my_data --index_name faiss_index --retriever_batch_size 16
```
Key parameters:
- `device`: Device to use, 'gpu' by default; choose from ['cpu', 'gpu'].
- `source_file`: Path to the source file; QA pairs are generated from it automatically and written to `doc_dir`.
- `doc_dir`: Directory where the generated QA-pair corpus is saved; the retrieval database is built automatically from this location. Defaults to 'data/my_data'.
- `index_name`: Name of the FAISS ANN index, 'faiss_index' by default.
- `retriever_batch_size`: Batch size used when building the ANN index, 16 by default.

If you only have a CPU machine, pass cpu via the --device flag (this takes much longer to run):
```bash
# run the example on a CPU
unset CUDA_VISIBLE_DEVICES
python run_pipelines_example.py --device cpu --source_file data/source_file.txt --doc_dir data/my_data --index_name faiss_index --retriever_batch_size 16
```
## Visualizing the Unsupervised Retrieval-Based QA System

Developers can further build a Web-based visual unsupervised retrieval QA system on top of Pipelines; it looks like this:
<div align="center">
<img src="https://user-images.githubusercontent.com/20476674/199488926-c64d3f4e-8117-475f-afe6-b02088105d09.gif" >
</div>
<!-- ## Building a QA System with Paddle-Serving
### Dependencies
Install with: `pip install -r requirements.txt` -->

### Offline QA-Pair Corpus Construction

This section describes how to build the QA-pair corpus offline; online QA-pair corpus generation is also integrated into the Pipeline.

#### Data Description

We use the provided plain-text file [source_file.txt](https://paddlenlp.bj.bcebos.com/applications/unsupervised_qa/source_file.txt) as an example. The system treats each line as a context and generates multiple QA pairs from it; the index is then built from these QA pairs. The file can be downloaded directly into the `data` directory, or you can use your own file.

#### QA-Pair Generation

For standard scenarios, the provided pretrained models can be used directly for zero-shot QA-pair generation. For specialized scenarios, developers can train a [custom model](#custom-models) to their own needs and load it for QA-pair generation to further improve quality.

Generate the QA-pair corpus with the following command:
```shell
export CUDA_VISIBLE_DEVICES=0
python -u run_qa_pairs_generation.py \
--source_file_path=data/source_file.txt \
--target_file_path=data/target_file.json \
--answer_generation_model_path=uie-base-answer-extractor-v1 \
--question_generation_model_path=unimo-text-1.0-question-generation \
--filtration_model_path=uie-base-qa-filter-v1 \
--batch_size=8 \
--a_max_answer_candidates=10 \
--a_prompt='答案' \
--a_position_prob=0.01 \
--q_num_return_sequences=3 \
--q_max_question_length=50 \
--q_decode_strategy=sampling \
--q_top_k=5 \
--q_top_p=1 \
--do_filtration \
--f_filtration_position_prob=0.01 \
--do_debug
```
Key parameters:
- `source_file_path`: Path to the source file; each line is one context from which QA pairs are generated.
- `target_file_path`: Path to the target file; the generated file is in JSON format.
- `answer_generation_model_path`: Path of the answer extraction model to load. It can be a PaddleNLP pretrained model or a local checkpoint path. For a PaddleNLP pretrained model, choose one of the following.

| Available pretrained models |
|---------------------------------|
| uie-base-answer-extractor-v1 |

- `question_generation_model_path`: Path of the question generation model to load. It can be a PaddleNLP pretrained model or a local checkpoint path. For a PaddleNLP pretrained model, choose one of the following.

| Available pretrained models |
|---------------------------------|
| unimo-text-1.0-question-generation |
| unimo-text-1.0-dureader_qg |
| unimo-text-1.0-question-generation-dureader_qg |

- `filtration_model_path`: Path of the filtering model to load. It can be a PaddleNLP pretrained model or a local checkpoint path. For a PaddleNLP pretrained model, choose one of the following.

| Available pretrained models |
|---------------------------------|
| uie-base-qa-filter-v1 |

- `batch_size`: Batch size used by Taskflow; adjust it to your machine. Defaults to 8.
- `a_max_answer_candidates`: Maximum number of candidate answers returned per input in the answer extraction stage. Defaults to 5.
- `a_prompt`: Prompt(s) used in the answer extraction stage, separated by ","; defaults to "答案" (answer).
- `a_position_prob`: Confidence threshold for the answer extraction stage. Defaults to 0.01.
- `q_num_return_sequences`: Number of candidate questions returned in the question generation stage; it should be smaller than `q_num_beams` when the "beam_search" decoding strategy is used. Defaults to 3.
- `q_max_question_length`: Maximum decoding length in the question generation stage. Defaults to 50.
- `q_decode_strategy`: Decoding strategy used in the question generation stage. Defaults to "sampling".
- `q_top_k`: Top-k value when the "sampling" decoding strategy is used. Defaults to 5.
- `q_top_p`: Top-p value when the "sampling" decoding strategy is used. Defaults to 0.
- `q_num_beams`: Beam size when the "beam_search" decoding strategy is used. Defaults to 6.
- `do_filtration`: Whether to apply filtering.
- `f_filtration_position_prob`: Confidence threshold for the filtering stage. Defaults to 0.1.
- `do_debug`: Whether to enable debug mode; in debug mode, the QA pairs that were filtered out are printed.
#### Corpus Conversion

Run the following script to convert the generated QA pairs into the corpora required for semantic indexing: train.csv, dev.csv, q_corpus.csv and qa_pair.csv:
```shell
python -u run_corpus_preparation.py \
--source_file_path data/target_file.json \
--target_dir_path data/my_corpus
```
Key parameters:
- `source_file_path`: Path of the QA-pair JSON file to convert (the target file produced in the previous step).
- `target_dir_path`: Output directory for the converted data. Defaults to "data/my_corpus".
- `test_sample_num`: Number of test samples held out when building the retrieval system. Defaults to 0.
- `train_sample_num`: Number of supervised training samples held out when building the retrieval system. Defaults to 0.
- `all_sample_num`: Total number of samples kept when building the retrieval system. Defaults to None, which keeps all samples except the first `test_sample_num` + `train_sample_num` ones.
<!-- ### Retrieval Model Training and Deployment
Given a QA corpus and a semantic retrieval model, deployment first converts the retrieval model from a dynamic graph to a static graph and then to the Serving format; a semantic retrieval engine also needs to be built on top of Milvus and the QA corpus.
For unsupervised training of the semantic retrieval model and deployment against a given QA corpus, please refer to faq_system -->
### Building a QA System with Pipelines

This project provides a low-cost way to build, on top of Pipelines, an intelligent retrieval QA system driven by automatically generated QA pairs. Developers only need to provide unstructured plain text: the prebuilt QA-pair generation module produces a large number of QA pairs, from which a retrieval QA system tailored to a specific business can be set up quickly, complete with a Web-based visual product service. The Web UI supports QA retrieval, online QA-pair generation, online file upload and parsing, online index updates and more, and can be adapted as needed. For the detailed build process, see [Pipelines: unsupervised intelligent retrieval QA system](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/pipelines/examples/unsupervised-question-answering).

## Custom Models

In addition to the prebuilt models, you can train and plug in your own. We provide customized training recipes for every stage: answer extraction, question generation, the round-trip filtering model, semantic indexing, recall and ranking.

### Data Preparation

This section describes how to prepare and preprocess the data needed to fine-tune the answer extraction, question generation and filtering modules. For how to prepare the QA-pair data needed to train a custom semantic indexing model in an unsupervised way, see [Offline QA-Pair Corpus Construction](#offline-qa-pair-corpus-construction).

#### Custom Data

In many cases a local dataset is needed to fine-tune the model and obtain customized behavior, so that the generated QA pairs are closer to the desired distribution. This project supports fine-tuning with local dataset files in a fixed format.

We provide pre-annotated sample files [train.json](https://paddlenlp.bj.bcebos.com/applications/unsupervised_qa/train.json) and [dev.json](https://paddlenlp.bj.bcebos.com/applications/unsupervised_qa/dev.json), which can be downloaded directly into the `data` directory. You can also build your own local dataset; specifically, it consists of the following files:
```text
data
├── train.json # training set
├── dev.json # development set
└── test.json # optional, data to be predicted
```
The local dataset files use the following format:
```text
# train.json/dev.json/test.json format:
{
"context": <context_text>,
"answer": <answer_text>,
"question": <question_text>,
}
...
```
A concrete example:
```text
train.json/dev.json/test.json example:
{
"context": "欠条是永久有效的,未约定还款期限的借款合同纠纷,诉讼时效自债权人主张债权之日起计算,时效为2年。 根据《中华人民共和国民法通则》第一百三十五条:向人民法院请求保护民事权利的诉讼时效期间为二年,法律另有规定的除外。 第一百三十七条:诉讼时效期间从知道或者应当知道权利被侵害时起计算。但是,从权利被侵害之日起超过二十年的,人民法院不予保护。有特殊情况的,人民法院可以延长诉讼时效期间。 第六十二条第(四)项:履行期限不明确的,债务人可以随时履行,债权人也可以随时要求履行,但应当给对方必要的准备时间。",
"answer": "永久有效",
"question": "欠条的有效期是多久"
}
...
```
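The snippet below is a small, hypothetical sanity check for files in this format, assuming one JSON object per line (adapt it if your file stores a pretty-printed JSON array instead); it only verifies that each record has the three required fields and that the answer actually occurs in its context.

```python
import json

REQUIRED_KEYS = {"context", "answer", "question"}


def check_qa_file(path):
    """Report records that miss a required field or whose answer is not a substring of the context."""
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            missing = REQUIRED_KEYS - record.keys()
            if missing:
                print(f"line {line_no}: missing keys {sorted(missing)}")
            elif record["answer"] not in record["context"]:
                print(f"line {line_no}: answer not found in context")


check_qa_file("data/train.json")
```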
#### Data Preprocessing

Run the following script to preprocess the dataset into the data needed for fine-tuning the answer extraction, question generation and filtering modules. Note that the fine-tuning data for these three modules all comes from the same dataset.
```shell
python -u run_data_preprocess.py \
--source_file_path data/train.json \
--target_dir data/finetune \
--do_answer_prompt
python -u run_data_preprocess.py \
--source_file_path data/dev.json \
--target_dir data/finetune \
--do_answer_prompt
```
Key parameters:
- `source_file_path`: Training or test dataset file to convert; see [Custom Data](#custom-data) for the format requirements.
- `target_dir`: Output directory, "data/finetune" by default.
- `do_answer_prompt`: Whether to add the "答案" (answer) prompt when building the answer extraction data.
- `do_len_prompt`: Whether to add a length prompt when building the answer extraction data.
- `do_domain_prompt`: Whether to add a domain prompt when building the answer extraction data.
- `domain`: The domain prompt to add; only effective when `do_domain_prompt` is set.

**NOTE:** The preprocessed fine-tuning data is written to the answer_extraction, question_generation and filtration subdirectories under `target_dir`.
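For reference, with `--do_answer_prompt` the answer-extraction records produced by preprocessing are expected to follow the UIE-style `content`/`prompt`/`result_list` layout consumed by the `reader` and `convert_example` utilities later in this commit. The record below is an illustrative assumption based on the sample data above, and the character offsets are shown only for illustration.

```text
{"content": "欠条是永久有效的,未约定还款期限的借款合同纠纷,...", "prompt": "答案", "result_list": [{"text": "永久有效", "start": 3, "end": 7}]}
```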
### Model Fine-Tuning

#### Answer Extraction

Run the following command to fine-tune the answer extraction model on the sample training set. You can fine-tune from `uie-base-answer-extractor`, or start from `uie-base` or a similar base model.
```shell
# Launch on GPU; the `--gpus` flag specifies the GPU card ids, one card or several
# e.g. to use cards 1 and 2: `--gpus "1,2"`
unset CUDA_VISIBLE_DEVICES
python -u -m paddle.distributed.launch --gpus "1,2" --log_dir log/answer_extraction finetune/answer_extraction_and_roundtrip_filtration/finetune.py \
--train_path=data/finetune/answer_extraction/train.json \
--dev_path=data/finetune/answer_extraction/dev.json \
--save_dir=log/answer_extraction/checkpoints \
--learning_rate=1e-5 \
--batch_size=16 \
--max_seq_len=512 \
--num_epochs=30 \
--model=uie-base \
--seed=1000 \
--logging_steps=100 \
--valid_steps=100 \
--device=gpu
```
Key parameters:
- `train_path`: Path of the training set file.
- `dev_path`: Path of the dev set file.
- `save_dir`: Directory where checkpoints are stored, `log/answer_extraction/checkpoints` by default.
- `learning_rate`: Learning rate, 1e-5 by default.
- `batch_size`: Batch size; adjust it to your machine. Defaults to 16.
- `max_seq_len`: Maximum split length of the text; inputs longer than this are split automatically. Defaults to 512.
- `num_epochs`: Number of training epochs, 30 by default.
- `model`: Model to fine-tune from; choose from `uie-base-answer-extractor`, `uie-base`, `uie-medium`, `uie-mini`, `uie-micro` and `uie-nano`. Defaults to `uie-base`.
- `init_from_ckpt`: Path of the model parameters used for initialization.
- `seed`: Random seed, 1000 by default.
- `logging_steps`: Interval (in steps) between log prints, 10 by default.
- `valid_steps`: Interval (in steps) between evaluations, 100 by default.
- `device`: Device to train on, cpu or gpu.

Evaluate the model on the sample dev set with the following command:
```shell
python finetune/answer_extraction_and_roundtrip_filtration/evaluate.py \
--model_path=log/answer_extraction/checkpoints/model_best \
--test_path=data/finetune/answer_extraction/dev.json \
--batch_size=16 \
--max_seq_len=512 \
--limit=0.01
```
Key parameters:
- `model_path`: Path of the model directory to evaluate; it must contain the weights file `model_state.pdparams` and the config file `model_config.json`.
- `test_path`: Test set file used for evaluation.
- `batch_size`: Batch size; adjust it to your machine. Defaults to 16.
- `max_seq_len`: Maximum split length of the text; inputs longer than this are split automatically. Defaults to 512.
- `model`: Model to use; choose from `uie-base`, `uie-medium`, `uie-mini`, `uie-micro` and `uie-nano`. Defaults to `uie-base`.
- `debug`: Whether to enable debug mode and evaluate each positive class separately; this mode is only for debugging and is off by default.
- `limit`: The `limit` of the SpanEvaluator metric; a span is returned when the value of the last dimension of the probability array exceeds this threshold. Setting limit to 0.01 emphasizes the model's recall, i.e. answer coverage.
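As a rough intuition for `limit`: the evaluator turns the start/end probability arrays into predicted spans by keeping every position whose probability exceeds `limit` (see the `SpanEvaluator` implementation later in this commit), so a lower threshold keeps more candidate spans, which tends to raise recall at the cost of precision. The toy numbers below are made up purely for illustration.

```python
# Toy illustration (made-up probabilities) of how `limit` changes the predicted positions.
start_probs = [0.90, 0.02, 0.30, 0.01]
end_probs = [0.01, 0.85, 0.01, 0.25]


def positions_over(probs, limit):
    return [i for i, p in enumerate(probs) if p > limit]


for limit in (0.5, 0.01):
    # With limit=0.5 only the confident pair (start 0, end 1) survives;
    # with limit=0.01 the weaker pair (start 2, end 3) is kept as well, favoring recall.
    print(limit, positions_over(start_probs, limit), positions_over(end_probs, limit))
```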
#### Question Generation

Run the following command to fine-tune the question generation model on the sample training set and validate it on the sample dev set.
```shell
# Launch on GPU; the `--gpus` flag specifies the GPU card ids, one card or several
# e.g. to use cards 1 and 2: `--gpus "1,2"`
unset CUDA_VISIBLE_DEVICES
python -u -m paddle.distributed.launch --gpus "1,2" --log_dir log/question_generation finetune/question_generation/train.py \
--train_file=data/finetune/question_generation/train.json \
--predict_file=data/finetune/question_generation/dev.json \
--save_dir=log/question_generation/checkpoints \
--output_path=log/question_generation/predict.txt \
--dataset_name=dureader_qg \
--model_name_or_path="unimo-text-1.0" \
--logging_steps=100 \
--save_steps=500 \
--epochs=20 \
--batch_size=16 \
--learning_rate=1e-5 \
--warmup_proportion=0.02 \
--weight_decay=0.01 \
--max_seq_len=512 \
--max_target_len=30 \
--do_train \
--do_predict \
--max_dec_len=20 \
--min_dec_len=3 \
--num_return_sequences=1 \
--template=1 \
--device=gpu
```
Key parameters:
- `gpus`: GPUs used for training; for multi-card training, specify multiple card ids, e.g. --gpus "0,1".
- `dataset_name`: Dataset name, used to specify the data format; defaults to `dureader_qg`.
- `train_file`: Path of the local training data; its format must match the dataset specified by `dataset_name`. Defaults to None.
- `predict_file`: Path of the local test data; its format must match the dataset specified by `dataset_name`. Defaults to None.
- `model_name_or_path`: Pretrained model used for fine-tuning; it can be a PaddleNLP pretrained model or a local one. For a local pretrained model, pass its directory, e.g. ./checkpoints/model_xx/, which must contain the Paddle weights file model_state.pdparams. For a PaddleNLP pretrained model, choose one of the following. Defaults to `unimo-text-1.0`.

| Available pretrained models |
|---------------------------------|
| unimo-text-1.0 |
| unimo-text-1.0-large |
| unimo-text-1.0-question-generation |

- `save_dir`: Directory where the model is saved.
- `output_path`: Path where predictions are saved.
- `logging_steps`: Logging interval.
- `save_steps`: Interval for saving the model and running evaluation.
- `seed`: Seed of the random number generator.
- `epochs`: Number of training epochs.
- `batch_size`: Number of samples per iteration **per card**.
- `learning_rate`: Base learning rate; the current learning rate is this value multiplied by the factor produced by the learning-rate scheduler.
- `weight_decay`: Weight-decay coefficient used by the AdamW optimizer.
- `warmup_proportion`: Proportion of total steps over which the learning rate warms up to the base learning rate (the `learning_rate` above).
- `max_seq_len`: Maximum length of the model input sequence.
- `max_target_len`: Maximum label length during training.
- `min_dec_len`: Minimum length of generated sequences.
- `max_dec_len`: Maximum length of generated sequences.
- `do_train`: Whether to train.
- `do_predict`: Whether to predict; the model is evaluated automatically on the dev set.
- `device`: Device to use, gpu or cpu.
- `template`: Template to use, chosen from [0, 1, 2, 3, 4]; 0 means no template and 1 the default template.

Training and validation run automatically; checkpoints are saved to the specified `save_dir` during training.

**Note:** To resume training, set `model_name_or_path` to the local model directory.
#### Filtering Model

Run the following command to fine-tune the filtering model on the sample training set. You can fine-tune from `uie-base-qa-filter`, or start from `uie-base` or a similar base model.
```shell
# Launch on GPU; the `--gpus` flag specifies the GPU card ids, one card or several
# e.g. to use cards 1 and 2: `--gpus "1,2"`
unset CUDA_VISIBLE_DEVICES
python -u -m paddle.distributed.launch --gpus "1,2" --log_dir log/filtration finetune/answer_extraction_and_roundtrip_filtration/finetune.py \
--train_path=data/finetune/filtration/train.json \
--dev_path=data/finetune/filtration/dev.json \
--save_dir=log/filtration/checkpoints \
--learning_rate=1e-5 \
--batch_size=16 \
--max_seq_len=512 \
--num_epochs=30 \
--model=uie-base \
--seed=1000 \
--logging_steps=100 \
--valid_steps=100 \
--device=gpu
```
Key parameters:
- `train_path`: Path of the training set file.
- `dev_path`: Path of the dev set file.
- `save_dir`: Directory where checkpoints are stored, `log/filtration/checkpoints` by default.
- `learning_rate`: Learning rate, 1e-5 by default.
- `batch_size`: Batch size; adjust it to your machine. Defaults to 16.
- `max_seq_len`: Maximum split length of the text; inputs longer than this are split automatically. Defaults to 512.
- `num_epochs`: Number of training epochs, 30 by default.
- `model`: Model to fine-tune from; choose from `uie-base-qa-filter`, `uie-base`, `uie-medium`, `uie-mini`, `uie-micro` and `uie-nano`. Defaults to `uie-base`.
- `init_from_ckpt`: Path of the model parameters used for initialization.
- `seed`: Random seed, 1000 by default.
- `logging_steps`: Interval (in steps) between log prints, 10 by default.
- `valid_steps`: Interval (in steps) between evaluations, 100 by default.
- `device`: Device to train on, cpu or gpu.

Evaluate the model on the sample dev set with the following command:
```shell
python finetune/answer_extraction_and_roundtrip_filtration/evaluate.py \
--model_path=log/filtration/checkpoints/model_best \
--test_path=data/finetune/filtration/dev.json \
--batch_size=16 \
--max_seq_len=512 \
--limit=0.5
```
Key parameters:
- `model_path`: Path of the model directory to evaluate; it must contain the weights file `model_state.pdparams` and the config file `model_config.json`.
- `test_path`: Test set file used for evaluation.
- `batch_size`: Batch size; adjust it to your machine. Defaults to 16.
- `max_seq_len`: Maximum split length of the text; inputs longer than this are split automatically. Defaults to 512.
- `model`: Model to use; choose from `uie-base`, `uie-medium`, `uie-mini`, `uie-micro` and `uie-nano`. Defaults to `uie-base`.
- `debug`: Whether to enable debug mode and evaluate each positive class separately; this mode is only for debugging and is off by default.
- `limit`: The `limit` of the SpanEvaluator metric; a span is returned when the value of the last dimension of the probability array exceeds this threshold.
#### Semantic Indexing and Recall Models

Our semantic indexing and recall model is a dual-tower model trained from the RocketQA QueryEncoder. It is used in the semantic indexing and recall stages to extract semantic vectors and retrieve similar candidates, respectively. Besides the prebuilt model, if you want to train and plug in your own model, see [FAQ Finance](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/question_answering/supervised_qa/faq_finance) for model training.

#### Ranking Model

Our ranking model is a single-tower model trained from the RocketQA CrossEncoder. It is used in the ranking stage of search to rerank the recalled results. For customized ranking training, see [CrossEncoder](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/ranking/cross_encoder).
## References
[1] Zheng, Chujie, and Minlie Huang. "Exploring prompt-based few-shot learning for grounded dialog generation." arXiv preprint arXiv:2109.06513 (2021).
[2] Li, Wei, et al. "Unimo: Towards unified-modal understanding and generation via cross-modal contrastive learning." arXiv preprint arXiv:2012.15409 (2020).
[3] Puri, Raul, et al. "Training question answering models from synthetic data." arXiv preprint arXiv:2002.09599 (2020).
[4] Lewis, Patrick, et al. "Paq: 65 million probably-asked questions and what you can do with them." Transactions of the Association for Computational Linguistics 9 (2021): 1098-1115.
[5] Alberti, Chris, et al. "Synthetic QA corpora generation with roundtrip consistency." arXiv preprint arXiv:1906.05416 (2019).
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from functools import partial
import paddle
from utils import convert_example, reader, unify_prompt_name
from paddlenlp.datasets import MapDataset, load_dataset
from paddlenlp.metrics import SpanEvaluator
from paddlenlp.transformers import UIE, AutoTokenizer
from paddlenlp.utils.log import logger
@paddle.no_grad()
def evaluate(model, metric, data_loader):
"""
Given a dataset, it evaluates the model and computes the metric.
Args:
model(obj:`paddle.nn.Layer`): A model to classify texts.
metric(obj:`paddle.metric.Metric`): The evaluation metric.
data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
"""
model.eval()
metric.reset()
for batch in data_loader:
input_ids, token_type_ids, att_mask, pos_ids, start_ids, end_ids = batch
start_prob, end_prob = model(input_ids, token_type_ids, att_mask, pos_ids)
start_ids = paddle.cast(start_ids, "float32")
end_ids = paddle.cast(end_ids, "float32")
num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)
metric.update(num_correct, num_infer, num_label)
precision, recall, f1 = metric.accumulate()
model.train()
return precision, recall, f1
def do_eval():
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
model = UIE.from_pretrained(args.model_path)
test_ds = load_dataset(reader, data_path=args.test_path, max_seq_len=args.max_seq_len, lazy=False)
class_dict = {}
if args.debug:
for data in test_ds:
class_name = unify_prompt_name(data["prompt"])
# Only positive examples are evaluated in debug mode
if len(data["result_list"]) != 0:
class_dict.setdefault(class_name, []).append(data)
else:
class_dict["all_classes"] = test_ds
for key in class_dict.keys():
if args.debug:
test_ds = MapDataset(class_dict[key])
else:
test_ds = class_dict[key]
test_ds = test_ds.map(partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len))
test_batch_sampler = paddle.io.BatchSampler(dataset=test_ds, batch_size=args.batch_size, shuffle=False)
test_data_loader = paddle.io.DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, return_list=True)
metric = SpanEvaluator(args.limit)
precision, recall, f1 = evaluate(model, metric, test_data_loader)
logger.info("-----------------------------")
logger.info("Class Name: %s" % key)
logger.info("Evaluation Precision: %.5f | Recall: %.5f | F1: %.5f" % (precision, recall, f1))
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default=None, help="The path of saved model that you want to load.")
parser.add_argument("--test_path", type=str, default=None, help="The path of test set.")
parser.add_argument("--batch_size", type=int, default=16, help="Batch size per GPU/CPU for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
parser.add_argument("--debug", action='store_true', help="Precision, recall and F1 score are calculated for each class separately if this option is enabled.")
parser.add_argument("--limit", type=float, default=0.5, help="The limit when using SpanEvaluator, when the last dimension in probability arrays is greater than the limit, the corresponding span will be returned.")
args = parser.parse_args()
# yapf: enable
do_eval()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
from functools import partial
import paddle
from evaluate import evaluate
from utils import convert_example, reader, set_seed
from paddlenlp.datasets import load_dataset
from paddlenlp.metrics import SpanEvaluator
from paddlenlp.transformers import UIE, AutoTokenizer
from paddlenlp.utils.log import logger
def do_train():
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
tokenizer = AutoTokenizer.from_pretrained(args.model)
model = UIE.from_pretrained(args.model)
train_ds = load_dataset(reader, data_path=args.train_path, max_seq_len=args.max_seq_len, lazy=False)
print("train data loaded successfully.")
dev_ds = load_dataset(reader, data_path=args.dev_path, max_seq_len=args.max_seq_len, lazy=False)
print("dev data loaded successfully.")
train_ds = train_ds.map(partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len))
dev_ds = dev_ds.map(partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len))
train_batch_sampler = paddle.io.BatchSampler(dataset=train_ds, batch_size=args.batch_size, shuffle=True)
train_data_loader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, return_list=True)
dev_batch_sampler = paddle.io.BatchSampler(dataset=dev_ds, batch_size=args.batch_size, shuffle=False)
dev_data_loader = paddle.io.DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, return_list=True)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate, parameters=model.parameters())
criterion = paddle.nn.BCELoss()
metric = SpanEvaluator()
loss_list = []
global_step = 0
best_f1 = 0
tic_train = time.time()
for epoch in range(1, args.num_epochs + 1):
for batch in train_data_loader:
input_ids, token_type_ids, att_mask, pos_ids, start_ids, end_ids = batch
start_prob, end_prob = model(input_ids, token_type_ids, att_mask, pos_ids)
start_ids = paddle.cast(start_ids, "float32")
end_ids = paddle.cast(end_ids, "float32")
loss_start = criterion(start_prob, start_ids)
loss_end = criterion(end_prob, end_ids)
loss = (loss_start + loss_end) / 2.0
loss.backward()
optimizer.step()
optimizer.clear_grad()
loss_list.append(float(loss))
global_step += 1
if global_step % args.logging_steps == 0 and rank == 0:
time_diff = time.time() - tic_train
loss_avg = sum(loss_list) / len(loss_list)
logger.info(
"global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
% (global_step, epoch, loss_avg, args.logging_steps / time_diff)
)
tic_train = time.time()
if global_step % args.valid_steps == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
model_to_save.save_pretrained(save_dir)
logger.disable()
tokenizer.save_pretrained(save_dir)
logger.enable()
precision, recall, f1 = evaluate(model, metric, dev_data_loader)
logger.info("Evaluation precision: %.5f, recall: %.5f, F1: %.5f" % (precision, recall, f1))
if f1 > best_f1:
logger.info(f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}")
best_f1 = f1
save_dir = os.path.join(args.save_dir, "model_best")
model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
model_to_save.save_pretrained(save_dir)
logger.disable()
tokenizer.save_pretrained(save_dir)
logger.enable()
tic_train = time.time()
if __name__ == "__main__":
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=16, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--train_path", default=None, type=str, help="The path of train set.")
parser.add_argument("--dev_path", default=None, type=str, help="The path of dev set.")
parser.add_argument("--save_dir", default='.log/filtration/checkpoints', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_len", default=512, type=int, help="The maximum input sequence length. Sequences longer than this will be split automatically.")
parser.add_argument("--num_epochs", default=100, type=int, help="Total number of training epochs to perform.")
parser.add_argument("--seed", default=1000, type=int, help="Random seed for initialization")
parser.add_argument("--logging_steps", default=10, type=int, help="The interval steps to logging.")
parser.add_argument("--valid_steps", default=100, type=int, help="The interval steps to evaluate model performance.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--model", choices=["uie-base", "uie-tiny", "uie-medium", "uie-mini", "uie-micro", "uie-nano"], default="uie-base", type=str, help="Select the pretrained model for few-shot learning.")
parser.add_argument("--init_from_ckpt", default=None, type=str, help="The path of model parameters for initialization.")
args = parser.parse_args()
# yapf: enable
do_train()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.metric import Metric
from paddlenlp.utils.tools import get_bool_ids_greater_than, get_span
class SpanEvaluator(Metric):
"""
SpanEvaluator computes the precision, recall and F1-score for span detection.
"""
def __init__(self, limit=0.5):
super(SpanEvaluator, self).__init__()
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
self.limit = limit
def compute(self, start_probs, end_probs, gold_start_ids, gold_end_ids):
"""
Computes the precision, recall and F1-score for span detection.
"""
pred_start_ids = get_bool_ids_greater_than(start_probs, self.limit)
pred_end_ids = get_bool_ids_greater_than(end_probs, self.limit)
gold_start_ids = get_bool_ids_greater_than(gold_start_ids.tolist(), self.limit)
gold_end_ids = get_bool_ids_greater_than(gold_end_ids.tolist(), self.limit)
num_correct_spans = 0
num_infer_spans = 0
num_label_spans = 0
for predict_start_ids, predict_end_ids, label_start_ids, label_end_ids in zip(
pred_start_ids, pred_end_ids, gold_start_ids, gold_end_ids
):
[_correct, _infer, _label] = self.eval_span(
predict_start_ids, predict_end_ids, label_start_ids, label_end_ids
)
num_correct_spans += _correct
num_infer_spans += _infer
num_label_spans += _label
return num_correct_spans, num_infer_spans, num_label_spans
def update(self, num_correct_spans, num_infer_spans, num_label_spans):
"""
This function takes (num_correct_spans, num_infer_spans, num_label_spans) as input,
to accumulate and update the corresponding internal state of the SpanEvaluator object.
"""
self.num_infer_spans += num_infer_spans
self.num_label_spans += num_label_spans
self.num_correct_spans += num_correct_spans
def eval_span(self, predict_start_ids, predict_end_ids, label_start_ids, label_end_ids):
"""
evaluate position extraction (start, end)
return num_correct, num_infer, num_label
input: [1, 2, 10] [4, 12] [2, 10] [4, 11]
output: (1, 2, 2)
"""
pred_set = get_span(predict_start_ids, predict_end_ids)
label_set = get_span(label_start_ids, label_end_ids)
num_correct = len(pred_set & label_set)
num_infer = len(pred_set)
# For the case of overlapping in the same category,
# length of label_start_ids and label_end_ids is not equal
num_label = max(len(label_start_ids), len(label_end_ids))
return (num_correct, num_infer, num_label)
def accumulate(self):
"""
This function returns the mean precision, recall and f1 score for all accumulated minibatches.
Returns:
tuple: Returns tuple (`precision, recall, f1 score`).
"""
precision = float(self.num_correct_spans / self.num_infer_spans) if self.num_infer_spans else 0.0
recall = float(self.num_correct_spans / self.num_label_spans) if self.num_label_spans else 0.0
f1_score = float(2 * precision * recall / (precision + recall)) if self.num_correct_spans else 0.0
return precision, recall, f1_score
def reset(self):
"""
Reset function empties the evaluation memory for previous mini-batches.
"""
self.num_infer_spans = 0
self.num_label_spans = 0
self.num_correct_spans = 0
def name(self):
"""
Return name of metric instance.
"""
return "precision", "recall", "f1"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import math
import random
import re
import numpy as np
import paddle
from tqdm import tqdm
from paddlenlp.utils.log import logger
def set_seed(seed):
paddle.seed(seed)
random.seed(seed)
np.random.seed(seed)
def convert_example(example, tokenizer, max_seq_len):
"""
example: {
title
prompt
content
result_list
}
"""
encoded_inputs = tokenizer(
text=[example["prompt"]],
text_pair=[example["content"]],
truncation=True,
max_seq_len=max_seq_len,
pad_to_max_seq_len=True,
return_attention_mask=True,
return_position_ids=True,
return_dict=False,
return_offsets_mapping=True,
)
encoded_inputs = encoded_inputs[0]
offset_mapping = [list(x) for x in encoded_inputs["offset_mapping"]]
bias = 0
for index in range(1, len(offset_mapping)):
mapping = offset_mapping[index]
if mapping[0] == 0 and mapping[1] == 0 and bias == 0:
bias = offset_mapping[index - 1][1] + 1 # Includes [SEP] token
if mapping[0] == 0 and mapping[1] == 0:
continue
offset_mapping[index][0] += bias
offset_mapping[index][1] += bias
start_ids = [0 for x in range(max_seq_len)]
end_ids = [0 for x in range(max_seq_len)]
for item in example["result_list"]:
start = map_offset(item["start"] + bias, offset_mapping)
end = map_offset(item["end"] - 1 + bias, offset_mapping)
start_ids[start] = 1.0
end_ids[end] = 1.0
tokenized_output = [
encoded_inputs["input_ids"],
encoded_inputs["token_type_ids"],
encoded_inputs["position_ids"],
encoded_inputs["attention_mask"],
start_ids,
end_ids,
]
tokenized_output = [np.array(x, dtype="int64") for x in tokenized_output]
return tuple(tokenized_output)
def map_offset(ori_offset, offset_mapping):
"""
map ori offset to token offset
"""
for index, span in enumerate(offset_mapping):
if span[0] <= ori_offset < span[1]:
return index
return -1
def reader(data_path, max_seq_len=512):
"""
read json
"""
with open(data_path, "r", encoding="utf-8") as f:
i = 0
j = 0
for line in f:
json_line = json.loads(line)
content = json_line["content"].strip()
prompt = json_line["prompt"]
# The model input looks like: [CLS] Prompt [SEP] Content [SEP]
# It includes three special tokens.
if max_seq_len <= len(prompt) + 3:
raise ValueError("The value of max_seq_len is too small, please set a larger value")
max_content_len = max_seq_len - len(prompt) - 3
if len(content) <= max_content_len:
i += 1
yield json_line
else:
j += 1
result_list = json_line["result_list"]
json_lines = []
accumulate = 0
while True:
cur_result_list = []
for result in result_list:
if result["start"] + 1 <= max_content_len < result["end"]:
max_content_len = result["start"]
break
cur_content = content[:max_content_len]
res_content = content[max_content_len:]
while True:
if len(result_list) == 0:
break
elif result_list[0]["end"] <= max_content_len:
if result_list[0]["end"] > 0:
cur_result = result_list.pop(0)
cur_result_list.append(cur_result)
else:
cur_result_list = [result for result in result_list]
break
else:
break
json_line = {"content": cur_content, "result_list": cur_result_list, "prompt": prompt}
json_lines.append(json_line)
for result in result_list:
if result["end"] <= 0:
break
result["start"] -= max_content_len
result["end"] -= max_content_len
accumulate += max_content_len
max_content_len = max_seq_len - len(prompt) - 3
if len(res_content) == 0:
break
elif len(res_content) < max_content_len:
json_line = {"content": res_content, "result_list": result_list, "prompt": prompt}
json_lines.append(json_line)
break
else:
content = res_content
for json_line in json_lines:
yield json_line
def unify_prompt_name(prompt):
# The classification labels are shuffled during finetuning, so they need
# to be unified during evaluation.
if re.search(r"\[.*?\]$", prompt):
prompt_prefix = prompt[: prompt.find("[", 1)]
cls_options = re.search(r"\[.*?\]$", prompt).group()[1:-1].split(",")
cls_options = sorted(list(set(cls_options)))
cls_options = ",".join(cls_options)
prompt = prompt_prefix + "[" + cls_options + "]"
return prompt
return prompt
def add_negative_example(examples, texts, prompts, label_set, negative_ratio):
negative_examples = []
positive_examples = []
with tqdm(total=len(prompts)) as pbar:
for i, prompt in enumerate(prompts):
redundants_list = list(set(label_set) ^ set(prompt))
redundants_list.sort()
num_positive = len(examples[i])
if num_positive != 0:
actual_ratio = math.ceil(len(redundants_list) / num_positive)
else:
# Set num_positive to 1 for text without positive example
num_positive, actual_ratio = 1, 0
if actual_ratio <= negative_ratio or negative_ratio == -1:
idxs = [k for k in range(len(redundants_list))]
else:
idxs = random.sample(range(0, len(redundants_list)), negative_ratio * num_positive)
for idx in idxs:
negative_result = {"content": texts[i], "result_list": [], "prompt": redundants_list[idx]}
negative_examples.append(negative_result)
positive_examples.extend(examples[i])
pbar.update(1)
return positive_examples, negative_examples
def add_full_negative_example(examples, texts, relation_prompts, predicate_set, subject_goldens):
with tqdm(total=len(relation_prompts)) as pbar:
for i, relation_prompt in enumerate(relation_prompts):
negative_sample = []
for subject in subject_goldens[i]:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = subject + "的" + predicate
if prompt not in relation_prompt:
negative_result = {"content": texts[i], "result_list": [], "prompt": prompt}
negative_sample.append(negative_result)
examples[i].extend(negative_sample)
pbar.update(1)
return examples
def construct_relation_prompt_set(entity_name_set, predicate_set):
relation_prompt_set = set()
for entity_name in entity_name_set:
for predicate in predicate_set:
# The relation prompt is constructed as follows:
# subject + "的" + predicate
relation_prompt = entity_name + "的" + predicate
relation_prompt_set.add(relation_prompt)
return sorted(list(relation_prompt_set))
def generate_cls_example(text, labels, prompt_prefix, options):
random.shuffle(options)
cls_options = ",".join(options)
prompt = prompt_prefix + "[" + cls_options + "]"
result_list = []
example = {"content": text, "result_list": result_list, "prompt": prompt}
for label in labels:
start = prompt.rfind(label[0]) - len(prompt) - 1
end = start + len(label)
result = {"text": label, "start": start, "end": end}
example["result_list"].append(result)
return example
def convert_cls_examples(raw_examples, prompt_prefix="情感倾向", options=["正向", "负向"]):
"""
Convert labeled data export from doccano for classification task.
"""
examples = []
logger.info("Converting doccano data...")
with tqdm(total=len(raw_examples)):
for line in raw_examples:
items = json.loads(line)
# Compatible with doccano >= 1.6.2
if "data" in items.keys():
text, labels = items["data"], items["label"]
else:
text, labels = items["text"], items["label"]
example = generate_cls_example(text, labels, prompt_prefix, options)
examples.append(example)
return examples
def convert_ext_examples(
raw_examples, negative_ratio, prompt_prefix="情感倾向", options=["正向", "负向"], separator="##", is_train=True
):
"""
Convert labeled data export from doccano for extraction and aspect-level classification task.
"""
def _sep_cls_label(label, separator):
label_list = label.split(separator)
if len(label_list) == 1:
return label_list[0], None
return label_list[0], label_list[1:]
def _concat_examples(positive_examples, negative_examples, negative_ratio):
examples = []
if math.ceil(len(negative_examples) / len(positive_examples)) <= negative_ratio:
examples = positive_examples + negative_examples
else:
# Random sampling the negative examples to ensure overall negative ratio unchanged.
idxs = random.sample(range(0, len(negative_examples)), negative_ratio * len(positive_examples))
negative_examples_sampled = []
for idx in idxs:
negative_examples_sampled.append(negative_examples[idx])
examples = positive_examples + negative_examples_sampled
return examples
texts = []
entity_examples = []
relation_examples = []
entity_cls_examples = []
entity_prompts = []
relation_prompts = []
entity_label_set = []
entity_name_set = []
predicate_set = []
subject_goldens = []
logger.info("Converting doccano data...")
with tqdm(total=len(raw_examples)) as pbar:
for line in raw_examples:
items = json.loads(line)
entity_id = 0
if "data" in items.keys():
relation_mode = False
if isinstance(items["label"], dict) and "entities" in items["label"].keys():
relation_mode = True
text = items["data"]
entities = []
relations = []
if not relation_mode:
# Export file in JSONL format which doccano < 1.7.0
# e.g. {"data": "", "label": [ [0, 2, "ORG"], ... ]}
for item in items["label"]:
entity = {"id": entity_id, "start_offset": item[0], "end_offset": item[1], "label": item[2]}
entities.append(entity)
entity_id += 1
else:
# Export file in JSONL format for the relation labeling task from doccano < 1.7.0
# e.g. {"data": "", "label": {"entities": [ {"id": 0, "start_offset": 0, "end_offset": 6, "label": "ORG"}, ... ], "relations": [ {"id": 0, "from_id": 0, "to_id": 1, "type": "foundedAt"}, ... ]}}
entities.extend([entity for entity in items["label"]["entities"]])
if "relations" in items["label"].keys():
relations.extend([relation for relation in items["label"]["relations"]])
else:
# Export file in JSONL format from doccano >= 1.7.0
# e.g. {"text": "", "label": [ [0, 2, "ORG"], ... ]}
if "label" in items.keys():
text = items["text"]
entities = []
for item in items["label"]:
entity = {"id": entity_id, "start_offset": item[0], "end_offset": item[1], "label": item[2]}
entities.append(entity)
entity_id += 1
relations = []
else:
# Export file in JSONL (relation) format from doccano >= 1.7.0
# e.g. {"text": "", "entities": [ {"id": 0, "start_offset": 0, "end_offset": 6, "label": "ORG"}, ... ], "relations": [ {"id": 0, "from_id": 0, "to_id": 1, "type": "foundedAt"}, ... ]}
text, relations, entities = items["text"], items["relations"], items["entities"]
texts.append(text)
entity_example = []
entity_prompt = []
entity_example_map = {}
entity_map = {}  # entity id -> {name, start, end}
for entity in entities:
entity_name = text[entity["start_offset"] : entity["end_offset"]]
entity_map[entity["id"]] = {
"name": entity_name,
"start": entity["start_offset"],
"end": entity["end_offset"],
}
entity_label, entity_cls_label = _sep_cls_label(entity["label"], separator)
# Define the prompt prefix for entity-level classification
entity_cls_prompt_prefix = entity_name + "的" + prompt_prefix
if entity_cls_label is not None:
entity_cls_example = generate_cls_example(
text, entity_cls_label, entity_cls_prompt_prefix, options
)
entity_cls_examples.append(entity_cls_example)
result = {"text": entity_name, "start": entity["start_offset"], "end": entity["end_offset"]}
if entity_label not in entity_example_map.keys():
entity_example_map[entity_label] = {
"content": text,
"result_list": [result],
"prompt": entity_label,
}
else:
entity_example_map[entity_label]["result_list"].append(result)
if entity_label not in entity_label_set:
entity_label_set.append(entity_label)
if entity_name not in entity_name_set:
entity_name_set.append(entity_name)
entity_prompt.append(entity_label)
for v in entity_example_map.values():
entity_example.append(v)
entity_examples.append(entity_example)
entity_prompts.append(entity_prompt)
subject_golden = [] # Golden entity inputs
relation_example = []
relation_prompt = []
relation_example_map = {}
for relation in relations:
predicate = relation["type"]
subject_id = relation["from_id"]
object_id = relation["to_id"]
# The relation prompt is constructed as follows:
# subject + "的" + predicate
prompt = entity_map[subject_id]["name"] + "的" + predicate
if entity_map[subject_id]["name"] not in subject_golden:
subject_golden.append(entity_map[subject_id]["name"])
result = {
"text": entity_map[object_id]["name"],
"start": entity_map[object_id]["start"],
"end": entity_map[object_id]["end"],
}
if prompt not in relation_example_map.keys():
relation_example_map[prompt] = {"content": text, "result_list": [result], "prompt": prompt}
else:
relation_example_map[prompt]["result_list"].append(result)
if predicate not in predicate_set:
predicate_set.append(predicate)
relation_prompt.append(prompt)
for v in relation_example_map.values():
relation_example.append(v)
relation_examples.append(relation_example)
relation_prompts.append(relation_prompt)
subject_goldens.append(subject_golden)
pbar.update(1)
logger.info("Adding negative samples for first stage prompt...")
positive_examples, negative_examples = add_negative_example(
entity_examples, texts, entity_prompts, entity_label_set, negative_ratio
)
if len(positive_examples) == 0:
all_entity_examples = []
elif is_train:
all_entity_examples = _concat_examples(positive_examples, negative_examples, negative_ratio)
else:
all_entity_examples = positive_examples + negative_examples
all_relation_examples = []
if len(predicate_set) != 0:
if is_train:
logger.info("Adding negative samples for second stage prompt...")
relation_prompt_set = construct_relation_prompt_set(entity_name_set, predicate_set)
positive_examples, negative_examples = add_negative_example(
relation_examples, texts, relation_prompts, relation_prompt_set, negative_ratio
)
all_relation_examples = _concat_examples(positive_examples, negative_examples, negative_ratio)
else:
logger.info("Adding negative samples for second stage prompt...")
relation_examples = add_full_negative_example(
relation_examples, texts, relation_prompts, predicate_set, subject_goldens
)
all_relation_examples = [r for relation_example in relation_examples for r in relation_example]
return all_entity_examples, all_relation_examples, entity_cls_examples
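# Illustrative sketch of the converted outputs (a comment only, not executed):
# for a doccano relation sample where the entity "张三" (label e.g. "人物") is
# linked to "北京" by the predicate "出生地", the first-stage entity example is
#   {"content": text, "result_list": [{"text": "张三", "start": ..., "end": ...}], "prompt": "人物"}
# and the second-stage relation example uses subject + "的" + predicate:
#   {"content": text, "result_list": [{"text": "北京", "start": ..., "end": ...}], "prompt": "张三的出生地"}
# Negative samples keep the same "content" but carry an empty "result_list".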
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from functools import partial
import numpy as np
import paddle
import paddle.distributed as dist
from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler
from paddlenlp.data import Pad
def print_args(args):
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def set_seed(seed):
# Use the same data seed(for data shuffle) for all procs to guarantee data
# consistency after sharding.
random.seed(seed)
np.random.seed(seed)
# Different op seeds (for dropout) on different procs may be better.
paddle.seed(seed + dist.get_rank())
def convert_example(
example, tokenizer, max_seq_len=512, max_target_len=128, max_title_len=256, mode="train", template=0
):
"""Convert all examples into necessary features."""
if mode == "pretrain" or mode == "pretrain_test":
context = example["context"]
answer = example["answer"]
target = example["target"]
source = "答案:" + answer + tokenizer.sep_token + "上下文:" + context
title = None
elif mode == "train" or mode == "test":
target = None
title = None
if "source" in example and "title" in example:
source = example["source"]
if "title" in example.keys():
title = example["title"]
elif "context" in example and "answer" in example:
source = example["context"]
if "answer" in example.keys():
title = example["answer"]
else:
assert False, "Source and title are not in the input dictionary, nor are context and answer."
if "target" in example.keys():
target = example["target"]
elif "question" in example.keys():
target = example["question"]
if template == 1:
source = "答案:" + title + tokenizer.sep_token + "上下文:" + source
title = None
if target:
target = "问题:" + target
elif template == 2:
source = "答案:" + title + tokenizer.sep_token + "上下文:" + source
title = None
if target:
target = "在已知答案的前提下,问题:" + target
elif template == 3:
source = "这是一个问题生成任务,根据提供的答案和上下文,来生成问题。" + title + tokenizer.sep_token + "上下文:" + source
title = None
if target:
target = "问题:" + target
elif template == 4:
prompt_common = example["prompt_common"]
prompt_domain = example["prompt_domain"]
source = (
prompt_common
+ " "
+ tokenizer.sep_token
+ " "
+ "".join(
[" " + tokenizer.cls_token + " " + one + " " + tokenizer.sep_token + " " for one in prompt_domain]
)
+ " "
+ tokenizer.cls_token
+ " "
+ "答案:"
+ title
+ " "
+ tokenizer.sep_token
+ " "
+ tokenizer.cls_token
+ "上下文:"
+ source
)
title = None
if target:
target = "问题:" + target
if mode == "train" or mode == "pretrain":
tokenized_example = tokenizer.gen_encode(
source,
title=title,
target=target,
max_seq_len=max_seq_len,
max_target_len=max_target_len,
max_title_len=max_title_len,
return_position_ids=True,
return_length=True,
)
temp_tokens = tokenizer.convert_ids_to_tokens(tokenized_example["input_ids"])
index_list = []
count = tokenized_example["input_ids"].count(tokenizer.cls_token_id)
# If template==4, count must be equal to 7, otherwise count must be equal to 2
assert count == 7 or count == 2, (
str(count) + " is not in [2, 7], temp_tokens: " + " ".join(temp_tokens) + ", source: " + source
)
index = -1
for i in range(0, count):
index = tokenized_example["input_ids"].index(tokenizer.cls_token_id, index + 1)
index_list.append(index)
if template == 4:
tokenized_example["token_type_ids"] = (
[2] * (index_list[1] - index_list[0])
+ [3] * (index_list[4] - index_list[1])
+ [0] * (index_list[6] - index_list[4])
+ [1] * (len(tokenized_example["input_ids"]) - index_list[6])
)
target_start = index_list[-1]
target_end = tokenized_example["seq_len"]
# Use to gather the logits corresponding to the labels during training
tokenized_example["masked_positions"] = list(range(target_start, target_end - 1))
tokenized_example["labels"] = tokenized_example["input_ids"][target_start + 1 : target_end]
# For template 4, token_type_ids have already been rebuilt above.
return tokenized_example
elif mode == "test" or mode == "pretrain_test":
tokenized_example = tokenizer.gen_encode(
source,
title=title,
max_seq_len=max_seq_len,
max_title_len=max_title_len,
add_start_token_for_decoding=True,
return_position_ids=True,
)
if template == 4:
# temp_tokens = tokenizer.convert_ids_to_tokens(tokenized_example['input_ids'])
index_list = []
count = tokenized_example["input_ids"].count(tokenizer.cls_token_id)
assert count == 7, str(count) + " is not in [7]"
index = -1
for i in range(0, count):
index = tokenized_example["input_ids"].index(tokenizer.cls_token_id, index + 1)
index_list.append(index)
tokenized_example["token_type_ids"] = (
[2] * (index_list[1] - index_list[0])
+ [3] * (index_list[4] - index_list[1])
+ [0] * (index_list[6] - index_list[4])
+ [1] * (len(tokenized_example["input_ids"]) - index_list[6])
)
assert ("target" in example and example["target"]) or ("question" in example and example["question"]), example
if "target" in example and example["target"]:
tokenized_example["target"] = example["target"]
elif "question" in example and example["question"]:
tokenized_example["target"] = example["question"]
return tokenized_example
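# Illustrative example (a comment only, not executed), assuming the UNIMO
# tokenizer uses "[SEP]" as its sep token: with template=1 and a sample
# {"context": c, "answer": a, "question": q}, the encoder input becomes
#   source = "答案:" + a + "[SEP]" + "上下文:" + c
# and, in train mode, the decoding target becomes "问题:" + q. For template=4
# the token_type_ids are rebuilt from the positions of the [CLS] tokens that
# delimit the common prompt, the domain prompts, the answer and the context.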
def batchify_fn(batch_examples, pad_val, mode):
def pad_mask(batch_attention_mask):
batch_size = len(batch_attention_mask)
max_len = max(map(len, batch_attention_mask))
attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e9
for i, mask_data in enumerate(attention_mask):
seq_len = len(batch_attention_mask[i])
mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32")
# In order to ensure the correct broadcasting mechanism, expand one
# dimension to the second dimension (n_head of Transformer).
attention_mask = np.expand_dims(attention_mask, axis=1)
return attention_mask
pad_func = Pad(pad_val=pad_val, pad_right=False, dtype="int64")
input_ids = pad_func([example["input_ids"] for example in batch_examples])
token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples])
position_ids = pad_func([example["position_ids"] for example in batch_examples])
attention_mask = pad_mask([example["attention_mask"] for example in batch_examples])
if mode == "train" or mode == "pretrain":
max_len = max([example["seq_len"] for example in batch_examples])
masked_positions = np.concatenate(
[
np.array(example["masked_positions"]) + (max_len - example["seq_len"]) + i * max_len
for i, example in enumerate(batch_examples)
]
)
labels = np.concatenate([np.array(example["labels"], dtype="int64") for example in batch_examples])
return input_ids, token_type_ids, position_ids, attention_mask, masked_positions, labels
elif mode == "test" or mode == "pretrain_test":
return input_ids, token_type_ids, position_ids, attention_mask
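# Illustrative note (a comment only): Pad(pad_right=False) left-pads input_ids,
# so shorter sequences are aligned to the right edge of the batch. pad_mask()
# allocates a (batch_size, max_len, max_len) mask filled with -1e9, copies each
# example's attention mask into the bottom-right corner, and expands a head
# dimension, yielding the (batch_size, 1, max_len, max_len) mask that is
# broadcast over the transformer heads.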
def create_data_loader(dataset, tokenizer, args, mode):
trans_func = partial(
convert_example,
tokenizer=tokenizer,
max_seq_len=args.max_seq_len,
max_target_len=args.max_target_len,
max_title_len=args.max_title_len,
mode=mode,
template=args.template,
)
dataset = dataset.map(trans_func, lazy=True)
if mode == "pretrain":
batch_sampler = DistributedBatchSampler(dataset, batch_size=args.batch_size, shuffle=True)
elif mode == "train":
batch_sampler = DistributedBatchSampler(dataset, batch_size=args.batch_size, shuffle=True)
elif mode == "test" or mode == "pretrain_test":
batch_sampler = BatchSampler(dataset, batch_size=args.batch_size // 2, shuffle=False)
collate_fn = partial(batchify_fn, pad_val=tokenizer.pad_token_id, mode=mode)
data_loader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, return_list=True)
return dataset, data_loader
def post_process_sum(token_ids, tokenizer):
"""Post-process the decoded sequence. Truncate from the first <eos>."""
eos_pos = len(token_ids)
for i, tok_id in enumerate(token_ids):
if tok_id == tokenizer.mask_token_id:
eos_pos = i
break
token_ids = token_ids[:eos_pos]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens = tokenizer.merge_subword(tokens)
special_tokens = ["[UNK]"]
tokens = [token for token in tokens if token not in special_tokens]
return token_ids, tokens
def remove_template(instr):
    """Remove the question template prefix of a decoded sequence."""
    # str.strip() removes a set of characters rather than a prefix, so the
    # template prefixes are removed explicitly here.
    for prefix in ("在已知答案的前提下,问题:", "问题:"):
        if instr.startswith(prefix):
            return instr[len(prefix):]
    return instr
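# Illustrative example (a comment only, not executed):
#   remove_template("问题:世界上最早的地雷发明者是谁?")
#   -> "世界上最早的地雷发明者是谁?"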
def select_sum(ids, scores, tokenizer, max_dec_len=None, num_return_sequences=1):
results = []
group = []
tmp = []
if scores is not None:
ids = ids.numpy()
scores = scores.numpy()
if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
raise ValueError(
    "the length of `ids` ({}) must equal the length of `scores` ({}) and be a "
    "multiple of `num_return_sequences` ({})".format(len(ids), len(scores), num_return_sequences)
)
for pred, score in zip(ids, scores):
pred_token_ids, pred_tokens = post_process_sum(pred, tokenizer)
num_token = len(pred_token_ids)
target = "".join(pred_tokens)
target = remove_template(target)
# not ending
if max_dec_len is not None and num_token >= max_dec_len:
score -= 1e3
tmp.append([target, score])
if len(tmp) == num_return_sequences:
group.append(tmp)
tmp = []
for preds in group:
preds = sorted(preds, key=lambda x: -x[1])
results.append(preds[0][0])
else:
ids = ids.numpy()
for pred in ids:
pred_token_ids, pred_tokens = post_process_sum(pred, tokenizer)
num_token = len(pred_token_ids)
response = "".join(pred_tokens)
response = remove_template(response)
# TODO: Support return scores in FT.
tmp.append([response])
if len(tmp) == num_return_sequences:
group.append(tmp)
tmp = []
for preds in group:
results.append(preds[0][0])
return results
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import time
import paddle
import paddle.distributed as dist
from gen_utils import create_data_loader, print_args, select_sum, set_seed
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--dataset_name', type=str, default='dureader_qg', help='The name of the dataset to load.')
parser.add_argument('--model_name_or_path', type=str, default='unimo-text-1.0', help='The path or shortcut name of the pre-trained model.')
parser.add_argument("--predict_file", type=str, required=False, default=None, help="Predict data path.")
parser.add_argument('--save_dir', type=str, default='./checkpoints', help='The directory where the checkpoints will be saved.')
parser.add_argument('--logging_steps', type=int, default=100, help='Log every X updates steps.')
parser.add_argument('--seed', type=int, default=1, help='Random seed for initialization.')
parser.add_argument('--batch_size', type=int, default=16, help='Batch size per GPU/CPU for training.')
parser.add_argument('--max_seq_len', type=int, default=512, help='The maximum sequence length of training.')
parser.add_argument('--max_target_len', type=int, default=30, help='The maximum target sequence length of training.')
parser.add_argument('--max_title_len', type=int, default=30, help='The maximum title sequence length of training.')
parser.add_argument('--max_dec_len', type=int, default=20, help='The maximum sequence length of decoding.')
parser.add_argument('--min_dec_len', type=int, default=3, help='The minimal sequence length of decoding.')
parser.add_argument('--num_return_sequences', type=int, default=1, help='The numbers of returned sequences for one input in generation.')
parser.add_argument('--decode_strategy', type=str, default='beam_search', help='The decode strategy in generation.')
parser.add_argument('--top_k', type=int, default=0, help='The number of highest probability vocabulary tokens to keep for top-k sampling.')
parser.add_argument('--temperature', type=float, default=1.0, help='The value used to modulate the next token probabilities.')
parser.add_argument('--top_p', type=float, default=1.0, help='The cumulative probability for top-p sampling.')
parser.add_argument('--num_beams', type=int, default=6, help='The number of beams for beam search.')
parser.add_argument('--length_penalty', type=float, default=1.2, help='The exponential penalty to the sequence length for beam search.')
parser.add_argument('--device', type=str, default='gpu', help='The device to select for training the model.')
parser.add_argument('--output_path', type=str, default='./predict.txt', help='The file path where the infer result will be saved.')
parser.add_argument("--do_predict", action='store_true', help="Whether to eval and predict.")
parser.add_argument("--template", type=int, default=1, help="The template used during training, select from [0, 1, 2, 3, 4].")
args = parser.parse_args()
return args
# yapf: enable
def read_file(file):
with open(file, "r", encoding="utf-8") as f:
for line in f.readlines():
line = line.strip()
if not line:
continue
line = json.loads(line)
yield line
def run(args):
paddle.set_device(args.device)
world_size = dist.get_world_size()
if world_size > 1:
dist.init_parallel_env()
set_seed(args.seed)
model = UNIMOLMHeadModel.from_pretrained(args.model_name_or_path)
tokenizer = UNIMOTokenizer.from_pretrained(args.model_name_or_path)
if world_size > 1:
model = paddle.DataParallel(model)
if args.predict_file:
dev_ds = load_dataset(read_file, file=args.predict_file, lazy=False)
else:
dev_ds = load_dataset(args.dataset_name, splits="dev", data_files=args.predict_file)
dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args, "test")
if args.do_predict:
model_eval = model._layers if isinstance(model, paddle.DataParallel) else model
prediction(model_eval, dev_data_loader, args, tokenizer)
@paddle.no_grad()
def prediction(model, data_loader, args, tokenizer):
print("\nPred begin...")
model.eval()
pred_ref = []
time_begin = time.time()
total_time = 0.0
start_time = time.time()
for step, inputs in enumerate(data_loader, 1):
input_ids, token_type_ids, position_ids, attention_mask = inputs
ids, scores = model.generate(
input_ids=input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
max_length=args.max_dec_len,
min_length=args.min_dec_len,
decode_strategy=args.decode_strategy,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
num_beams=args.num_beams,
length_penalty=args.length_penalty,
num_return_sequences=args.num_return_sequences,
bos_token_id=tokenizer.cls_token_id,
eos_token_id=tokenizer.mask_token_id,
)
total_time += time.time() - start_time
if step % args.logging_steps == 0:
print("step %d - %.3fs/step" % (step, total_time / args.logging_steps))
total_time = 0.0
results = select_sum(ids, scores, tokenizer, args.max_dec_len, args.num_return_sequences)
pred_ref.extend(results)
start_time = time.time()
print("Generation cost time:", time.time() - time_begin)
with open(args.output_path, "w", encoding="utf-8") as fout:
for ref in pred_ref:
fout.write(ref + "\n")
if __name__ == "__main__":
args = parse_args()
print_args(args)
run(args)
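# Example invocation (illustrative; the script and data file names below are
# placeholders, not defined by this repository):
#   python predict.py \
#       --model_name_or_path unimo-text-1.0 \
#       --predict_file data/dev.json \
#       --output_path ./predict.txt \
#       --do_predict \
#       --device gpu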
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import time
import paddle
import paddle.distributed as dist
import paddle.nn.functional as F
from gen_utils import create_data_loader, print_args, select_sum, set_seed
from paddle.optimizer import AdamW
from paddlenlp.datasets import load_dataset
from paddlenlp.metrics import BLEU
from paddlenlp.transformers import (
BasicTokenizer,
LinearDecayWithWarmup,
UNIMOLMHeadModel,
UNIMOTokenizer,
)
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--dataset_name', type=str, default='dureader_qg', help='The name of the dataset to load.')
parser.add_argument('--model_name_or_path', type=str, default='unimo-text-1.0', help='The path or shortcut name of the pre-trained model.')
parser.add_argument("--train_file", type=str, required=False, default=None, help="Train data path.")
parser.add_argument("--predict_file", type=str, required=False, default=None, help="Predict data path.")
parser.add_argument('--save_dir', type=str, default='./checkpoints', help='The directory where the checkpoints will be saved.')
parser.add_argument('--logging_steps', type=int, default=100, help='Log every X updates steps.')
parser.add_argument('--save_steps', type=int, default=1000, help='Save checkpoint every X updates steps.')
parser.add_argument('--seed', type=int, default=1, help='Random seed for initialization.')
parser.add_argument('--batch_size', type=int, default=16, help='Batch size per GPU/CPU for training.')
parser.add_argument('--learning_rate', type=float, default=5e-5, help='The initial learning rate.')
parser.add_argument('--weight_decay', type=float, default=0.01, help='The weight decay for optimizer.')
parser.add_argument('--epochs', type=int, default=3, help='Total number of training epochs to perform.')
parser.add_argument('--warmup_proportion', type=float, default=0.02, help='The proportion of warmup steps.')
parser.add_argument('--max_grad_norm', type=float, default=1.0, help='The max value of grad norm.')
parser.add_argument('--beta1', type=float, default=0.9, help='beta1')
parser.add_argument('--beta2', type=float, default=0.98, help='beta2')
parser.add_argument('--epsilon', type=float, default=1e-6, help='epsilon')
parser.add_argument('--max_seq_len', type=int, default=512, help='The maximum sequence length of training.')
parser.add_argument('--max_target_len', type=int, default=30, help='The maximum target sequence length of training.')
parser.add_argument('--max_title_len', type=int, default=30, help='The maximum title sequence length of training.')
parser.add_argument('--max_dec_len', type=int, default=20, help='The maximum sequence length of decoding.')
parser.add_argument('--min_dec_len', type=int, default=3, help='The minimal sequence length of decoding.')
parser.add_argument('--num_return_sequences', type=int, default=1, help='The numbers of returned sequences for one input in generation.')
parser.add_argument('--decode_strategy', type=str, default='beam_search', help='The decode strategy in generation.')
parser.add_argument('--top_k', type=int, default=0, help='The number of highest probability vocabulary tokens to keep for top-k sampling.')
parser.add_argument('--temperature', type=float, default=1.0, help='The value used to modulate the next token probabilities.')
parser.add_argument('--top_p', type=float, default=1.0, help='The cumulative probability for top-p sampling.')
parser.add_argument('--num_beams', type=int, default=6, help='The number of beams for beam search.')
parser.add_argument('--length_penalty', type=float, default=1.2, help='The exponential penalty to the sequence length for beam search.')
parser.add_argument('--device', type=str, default='gpu', help='The device to select for training the model.')
parser.add_argument('--output_path', type=str, default='./predict.txt', help='The file path where the infer result will be saved.')
parser.add_argument("--do_train", action='store_true', help="Whether to train the model.")
parser.add_argument("--do_predict", action='store_true', help="Whether to eval and predict.")
parser.add_argument("--template", type=int, default=1, help="The template used during training, select from [0, 1, 2, 3, 4].")
args = parser.parse_args()
return args
# yapf: enable
def calc_bleu_n(preds, targets, n_size=4):
assert len(preds) == len(targets), (
"The length of pred_responses should be equal to the length of "
"target_responses. But received {} and {}.".format(len(preds), len(targets))
)
bleu = BLEU(n_size=n_size)
tokenizer = BasicTokenizer()
for pred, target in zip(preds, targets):
pred_tokens = tokenizer.tokenize(pred)
target_token = tokenizer.tokenize(target)
bleu.add_inst(pred_tokens, [target_token])
print("\n" + "*" * 15)
print("The auto evaluation result is:")
print("BLEU-" + str(n_size) + ":", bleu.score())
return bleu.score()
def calc_bleu(preds, targets):
calc_bleu_n(preds, targets, 1)
calc_bleu_n(preds, targets, 2)
calc_bleu_n(preds, targets, 3)
bleu4_score = calc_bleu_n(preds, targets, 4)
return bleu4_score
def read_file(file):
with open(file, "r", encoding="utf-8") as f:
for line in f.readlines():
line = line.strip()
if not line:
continue
line = json.loads(line)
yield line
def save_ckpt(model, tokenizer, save_dir, name):
output_dir = os.path.join(save_dir, "model_{}".format(name))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
def run(args):
paddle.set_device(args.device)
world_size = dist.get_world_size()
if world_size > 1:
dist.init_parallel_env()
set_seed(args.seed)
model = UNIMOLMHeadModel.from_pretrained(args.model_name_or_path)
tokenizer = UNIMOTokenizer.from_pretrained(args.model_name_or_path)
if world_size > 1:
model = paddle.DataParallel(model)
if args.train_file:
train_ds = load_dataset(read_file, file=args.train_file, lazy=False)
else:
train_ds = load_dataset(args.dataset_name, splits="train", data_files=args.train_file)
if args.predict_file:
dev_ds = load_dataset(read_file, file=args.predict_file, lazy=False)
else:
dev_ds = load_dataset(args.dataset_name, splits="dev", data_files=args.predict_file)
train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args, "train")
dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args, "test")
if args.do_train:
num_training_steps = args.epochs * len(train_data_loader)
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])]
optimizer = AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
beta1=args.beta1,
beta2=args.beta2,
epsilon=args.epsilon,
apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm),
)
step = 0
total_time = 0.0
best_bleu4 = 0
for epoch in range(args.epochs):
print("\nEpoch %d/%d" % (epoch + 1, args.epochs))
batch_start_time = time.time()
for inputs in train_data_loader:
step += 1
labels = inputs[-1]
logits = model(*inputs[:-1])
labels = paddle.nn.functional.one_hot(labels, num_classes=logits.shape[-1])
labels = paddle.nn.functional.label_smooth(labels)
loss = F.cross_entropy(logits, labels, soft_label=True)
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
total_time += time.time() - batch_start_time
if step % args.logging_steps == 0:
ppl = paddle.exp(loss)
print(
"step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step"
% (step, loss, ppl, optimizer.get_lr(), total_time / args.logging_steps)
)
total_time = 0.0
if step % args.save_steps == 0 or step >= num_training_steps:
if dist.get_rank() == 0:
save_ckpt(model, tokenizer, args.save_dir, step)
print("Saved step {} model.\n".format(step))
if args.do_predict:
model_eval = model._layers if isinstance(model, paddle.DataParallel) else model
bleu4 = evaluation(model_eval, dev_data_loader, args, tokenizer)
if bleu4 > best_bleu4:
print("best BLEU-4 performance has been updated: %.5f --> %.5f" % (best_bleu4, bleu4))
best_bleu4 = bleu4
save_ckpt(model, tokenizer, args.save_dir, "best")
batch_start_time = time.time()
print("\nTraining completed.")
elif args.do_predict:
model_eval = model._layers if isinstance(model, paddle.DataParallel) else model
evaluation(model_eval, dev_data_loader, args, tokenizer)
@paddle.no_grad()
def evaluation(model, data_loader, args, tokenizer):
print("\nEval begin...")
model.eval()
pred_ref = []
time_begin = time.time()
total_time = 0.0
start_time = time.time()
for step, inputs in enumerate(data_loader, 1):
input_ids, token_type_ids, position_ids, attention_mask = inputs
ids, scores = model.generate(
input_ids=input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
max_length=args.max_dec_len,
min_length=args.min_dec_len,
decode_strategy=args.decode_strategy,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
num_beams=args.num_beams,
length_penalty=args.length_penalty,
num_return_sequences=args.num_return_sequences,
bos_token_id=tokenizer.cls_token_id,
eos_token_id=tokenizer.mask_token_id,
)
total_time += time.time() - start_time
if step % args.logging_steps == 0:
print("step %d - %.3fs/step" % (step, total_time / args.logging_steps))
total_time = 0.0
results = select_sum(ids, scores, tokenizer, args.max_dec_len, args.num_return_sequences)
pred_ref.extend(results)
start_time = time.time()
print("Generation cost time:", time.time() - time_begin)
with open(args.output_path, "w", encoding="utf-8") as fout:
for ref in pred_ref:
fout.write(ref + "\n")
with open(args.output_path + ".reference.txt", "w", encoding="utf-8") as fout:
targets = [example["target"] for example in data_loader.dataset]
for target in targets:
fout.write(target + "\n")
print("\nSave inference result into: %s" % args.output_path)
if "target" in data_loader.dataset[0].keys():
targets = [example["target"] for example in data_loader.dataset]
bleu4_score = calc_bleu(pred_ref, targets)
model.train()
return bleu4_score
if __name__ == "__main__":
args = parse_args()
print_args(args)
run(args)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--source_file_path', type=str, default=None, help='the source json file path')
parser.add_argument('--target_dir_path', type=str, default=None, help='the target dir path')
parser.add_argument('--test_sample_num', type=int, default=0, help='the test sample number when preparing qa system data')
parser.add_argument('--train_sample_num', type=int, default=0, help='the train sample number when preparing qa system data')
parser.add_argument('--all_sample_num', type=int, default=None, help='the all sample number when preparing qa system data')
args = parser.parse_args()
return args
# yapf: enable
def convert_json_to_data(json_file, out_dir, test_sample_num, train_sample_num, all_sample_num=None):
with open(json_file, "r", encoding="utf-8") as rf, open(
os.path.join(out_dir, "qa_pair.csv"), "w", encoding="utf-8"
) as qa_pair_wf, open(os.path.join(out_dir, "qac_triple.csv"), "w", encoding="utf-8") as qac_triple_wf, open(
os.path.join(out_dir, "train.csv"), "w", encoding="utf-8"
) as train_wf, open(
os.path.join(out_dir, "q_corpus.csv"), "w", encoding="utf-8"
) as q_corpus_wf, open(
os.path.join(out_dir, "dev.csv"), "w", encoding="utf-8"
) as test_wf:
for i, json_line in enumerate(rf.readlines()):
line_dict = json.loads(json_line)
context = line_dict["context"]
if "answer" in line_dict and "question" in line_dict:
answer = line_dict["answer"]
question = line_dict["question"]
elif "synthetic_answer" in line_dict and "synthetic_question" in line_dict:
answer = line_dict["synthetic_answer"]
question = line_dict["synthetic_question"]
if isinstance(question, list):
    question = question[0]
if i < test_sample_num:
test_wf.write(question.replace("\n", " ").replace("\t", " ").strip() + "\n")
elif test_sample_num <= i < test_sample_num + train_sample_num:
train_wf.write(question.replace("\n", " ").replace("\t", " ").strip() + "\n")
if not all_sample_num or i < all_sample_num:
qa_pair_wf.write(
question.replace("\n", " ").replace("\t", " ").strip()
+ "\t"
+ answer.replace("\n", " ").replace("\t", " ").strip()
+ "\n"
)
qac_triple_wf.write(
question.replace("\n", " ").replace("\t", " ").strip()
+ "\t"
+ answer.replace("\n", " ").replace("\t", " ").strip()
+ "\t"
+ context
+ "\n"
)
q_corpus_wf.write(question.replace("\n", " ").replace("\t", " ").strip() + "\n")
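# Illustrative input/output (a comment only, not executed): given a JSONL line
#   {"context": "...", "synthetic_question": "...", "synthetic_answer": "..."}
# the script writes the question to dev.csv (first test_sample_num lines) or
# train.csv (next train_sample_num lines), "question\tanswer" to qa_pair.csv,
# "question\tanswer\tcontext" to qac_triple.csv, and the question alone to
# q_corpus.csv.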
if __name__ == "__main__":
args = parse_args()
convert_json_to_data(
args.source_file_path, args.target_dir_path, args.test_sample_num, args.train_sample_num, args.all_sample_num
)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
# yapf: disable
def parse_args():
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--source_file_path', type=str, default=None, help='the source json file path')
parser.add_argument('--target_dir', type=str, default='data', help='the target file path')
parser.add_argument('--do_answer_prompt', action="store_true", help="is use answer prompt")
parser.add_argument('--do_len_prompt', action="store_true", help="is use length prompt")
parser.add_argument('--do_domain_prompt', action="store_true", help="is use domain prompt")
parser.add_argument('--domain', type=str, default=None, help='the domain of the dataset when using domain prompt')
args = parser.parse_args()
return args
# yapf: enable
def convert_from_json_to_answer_extraction_format(
json_file, output_path, domain=None, do_answer_prompt=True, do_len_prompt=False, do_domain_prompt=False
):
with open(json_file, "r", encoding="utf-8") as rf, open(output_path, "w", encoding="utf-8") as wf:
for line in rf:
json_line = json.loads(line)
context = json_line["context"]
answer = json_line["answer"]
# Cut the abnormally long sample
if len(answer) > 300:
answer = answer[:300]
begin_id = context.find(answer)
assert begin_id != -1, "'" + answer + "' is not found in " + context
end_id = begin_id + len(answer)
result = {"text": answer, "start": begin_id, "end": end_id}
if do_answer_prompt:
outdict = {
"content": context,
"result_list": [result],
"prompt": "答案",
}
wf.write(json.dumps(outdict, ensure_ascii=False) + "\n")
if do_len_prompt:
if len(answer) < 10:
    len_prompt = "短答案"
elif len(answer) < 20:
    len_prompt = "中短答案"
elif len(answer) < 30:
    len_prompt = "中长答案"
else:
    len_prompt = "长答案"
len_outdict = {
    "content": context,
    "result_list": [result],
    "prompt": len_prompt,
}
wf.write(json.dumps(len_outdict, ensure_ascii=False) + "\n")
if do_domain_prompt and domain:
domain_outdict = {
"content": context,
"result_list": [result],
"prompt": domain,
}
wf.write(json.dumps(domain_outdict, ensure_ascii=False) + "\n")
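# Illustrative output record (a comment only, not executed): for an input line
# {"context": c, "answer": a, ...} with --do_answer_prompt, one JSON record is
# written per line:
#   {"content": c, "result_list": [{"text": a, "start": c.find(a), "end": c.find(a) + len(a)}], "prompt": "答案"}
# With --do_len_prompt an extra record is emitted whose prompt encodes the
# answer length ("短答案" / "中短答案" / "中长答案" / "长答案").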
def convert_from_json_to_question_generation_format(json_file, output_path, tokenizer=None):
with open(json_file, "r", encoding="utf-8") as rf, open(output_path, "w", encoding="utf-8") as wf:
for line in rf:
json_line = json.loads(line)
context = json_line["context"]
answer = json_line["answer"]
# Cut the abnormally long sample
if len(answer) > 300:
answer = answer[:300]
question = json_line["question"]
outdict = {
"question": question,
"answer": answer,
"context": context,
}
wf.write(json.dumps(outdict, ensure_ascii=False) + "\n")
def convert_from_json_to_filtration_format(json_file, output_path, tokenizer=None):
with open(json_file, "r", encoding="utf-8") as rf, open(output_path, "w", encoding="utf-8") as wf:
for line in rf:
json_line = json.loads(line)
context = json_line["context"]
answer = json_line["answer"]
# Cut the abnormally long sample
if len(answer) > 300:
answer = answer[:300]
question = json_line["question"]
prefix = "问题:" + question + "上下文:"
content = prefix + context
begin_id = context.find(answer)
assert begin_id != -1, "'" + answer + "' is not found in " + context
end_id = begin_id + len(answer)
begin_id += len(prefix)
end_id += len(prefix)
result = {"text": answer, "start": begin_id, "end": end_id}
outdict = {
"content": content,
"result_list": [result],
"prompt": "答案",
}
wf.write(json.dumps(outdict, ensure_ascii=False) + "\n")
if __name__ == "__main__":
args = parse_args()
answer_extraction_target_file_path = os.path.join(
args.target_dir, "answer_extraction", os.path.basename(args.source_file_path)
)
if not os.path.exists(os.path.dirname(answer_extraction_target_file_path)):
os.makedirs(os.path.dirname(answer_extraction_target_file_path))
convert_from_json_to_answer_extraction_format(
json_file=args.source_file_path,
output_path=answer_extraction_target_file_path,
domain=args.domain,
do_answer_prompt=args.do_answer_prompt,
do_len_prompt=args.do_len_prompt,
do_domain_prompt=args.do_domain_prompt,
)
question_generation_target_file_path = os.path.join(
args.target_dir, "question_generation", os.path.basename(args.source_file_path)
)
if not os.path.exists(os.path.dirname(question_generation_target_file_path)):
os.makedirs(os.path.dirname(question_generation_target_file_path))
convert_from_json_to_question_generation_format(
json_file=args.source_file_path, output_path=question_generation_target_file_path
)
filtration_target_file_path = os.path.join(args.target_dir, "filtration", os.path.basename(args.source_file_path))
if not os.path.exists(os.path.dirname(filtration_target_file_path)):
os.makedirs(os.path.dirname(filtration_target_file_path))
convert_from_json_to_filtration_format(json_file=args.source_file_path, output_path=filtration_target_file_path)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pprint import pprint
from pipelines.document_stores import FAISSDocumentStore
from pipelines.nodes import (
AnswerExtractor,
DensePassageRetriever,
ErnieRanker,
QAFilter,
QuestionGenerator,
)
from pipelines.pipelines import QAGenerationPipeline, SemanticSearchPipeline
from pipelines.utils import convert_files_to_dicts, print_documents
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run dense_qa system, defaults to gpu.")
parser.add_argument("--index_name", default='faiss_index', type=str, help="The ann index name of FAISS.")
parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
parser.add_argument("--doc_dir", default="data/my_data", type=str, help="The question-answer pairs file to be loaded when building ANN index.")
parser.add_argument("--source_file", default=None, type=str, help="The source raw texts file to be loaded when creating question-answer pairs.")
args = parser.parse_args()
# yapf: enable
def dense_faq_pipeline():
use_gpu = args.device == "gpu"
faiss_document_store = "faiss_document_store.db"
if os.path.exists(args.index_name) and os.path.exists(faiss_document_store):
# connect to the existing FAISS index
document_store = FAISSDocumentStore.load(args.index_name)
retriever = DensePassageRetriever(
document_store=document_store,
query_embedding_model="rocketqa-zh-dureader-query-encoder",
passage_embedding_model="rocketqa-zh-dureader-query-encoder",
max_seq_len_query=args.max_seq_len_query,
max_seq_len_passage=args.max_seq_len_passage,
batch_size=args.retriever_batch_size,
use_gpu=use_gpu,
embed_title=False,
)
else:
dicts = convert_files_to_dicts(
dir_path=args.doc_dir, split_paragraphs=True, split_answers=True, encoding="utf-8"
)
if os.path.exists(args.index_name):
os.remove(args.index_name)
if os.path.exists(faiss_document_store):
os.remove(faiss_document_store)
document_store = FAISSDocumentStore(embedding_dim=768, faiss_index_factory_str="Flat")
document_store.write_documents(dicts)
retriever = DensePassageRetriever(
document_store=document_store,
query_embedding_model="rocketqa-zh-dureader-query-encoder",
passage_embedding_model="rocketqa-zh-dureader-query-encoder",
max_seq_len_query=args.max_seq_len_query,
max_seq_len_passage=args.max_seq_len_passage,
batch_size=args.retriever_batch_size,
use_gpu=use_gpu,
embed_title=False,
)
# update Embedding
document_store.update_embeddings(retriever)
# save index
document_store.save(args.index_name)
# Ranker
ranker = ErnieRanker(model_name_or_path="rocketqa-zh-dureader-cross-encoder", use_gpu=use_gpu)
pipe = SemanticSearchPipeline(retriever, ranker)
pipeline_params = {"Retriever": {"top_k": 50}, "Ranker": {"top_k": 1}}
prediction = pipe.run(query="世界上最早的地雷发明者是谁?", params=pipeline_params)
print_documents(prediction, print_name=False, print_meta=True)
def qa_generation_pipeline():
answer_extractor = AnswerExtractor(
model="uie-base-answer-extractor",
device=args.device,
schema=["答案"],
max_answer_candidates=3,
position_prob=0.01,
batch_size=1,
)
question_generator = QuestionGenerator(
model="unimo-text-1.0-question-generation",
device=args.device,
num_return_sequences=2,
)
qa_filter = QAFilter(
model="uie-base-qa-filter",
device=args.device,
schema=["答案"],
position_prob=0.1,
)
pipe = QAGenerationPipeline(
answer_extractor=answer_extractor, question_generator=question_generator, qa_filter=qa_filter
)
pipeline_params = {"QAFilter": {"is_filter": True}}
# list example
meta = [
"世界上最早的电影院是美国洛杉矶的“电气剧场”,建于1902年。",
"以脸书为例,2020年时,54%的成年人表示,他们从该平台获取新闻。而现在,这个数字下降到了44%。与此同时,YouTube在过去几年里一直保持平稳,约有三分之一的用户在该平台上获取新闻。",
]
prediction = pipe.run(meta=meta, params=pipeline_params)
prediction = prediction["filtered_cqa_triples"]
pprint(prediction)
# file example
if args.source_file:
meta = []
with open(args.source_file, "r", encoding="utf-8") as rf:
for line in rf:
meta.append(line.strip())
prediction = pipe.run(meta=meta, params=pipeline_params)
prediction = prediction["filtered_cqa_triples"]
if not os.path.exists(args.doc_dir):
os.makedirs(args.doc_dir)
with open(os.path.join(args.doc_dir, "generated_qa_pairs.txt"), "w", encoding="utf-8") as wf:
for pair in prediction:
wf.write(pair["synthetic_question"].strip() + "\t" + pair["synthetic_answer"].strip() + "\n")
if __name__ == "__main__":
qa_generation_pipeline()
dense_faq_pipeline()
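# Example invocation (illustrative; the script name is a placeholder):
#   python run_qa_pipelines.py \
#       --device gpu \
#       --source_file data/source_file.txt \
#       --doc_dir data/my_data \
#       --index_name faiss_index
# This first generates and filters question-answer pairs into doc_dir, then
# builds (or loads) the FAISS index and runs the dense FAQ query.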