Commit 10f294ff authored by yuguo-Jack's avatar yuguo-Jack
llama_paddle

parent 7c64e6ec
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "serving_server" \
--client_path "serving_client" \
--fetch_alias_names "output_embedding"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python feature_extract.py \
--model_dir=./output \
--model_name_or_path rocketqa-zh-base-query-encoder \
--corpus_file "data/corpus.csv"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# gpu
python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--params_path "checkpoints/model_100/model_state.pdparams" \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
--output_emb_size 256\
--max_seq_length 64 \
--recall_num 10 \
--similar_text_pair "data/test_pair.csv" \
--corpus_file "data/corpus.csv"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python -u -m paddle.distributed.launch --gpus='1' \
train.py \
--device gpu \
--model_name_or_path rocketqa-zh-base-query-encoder \
--save_dir ./checkpoints/ \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--save_steps 50 \
--eval_steps 50 \
--max_seq_length 64 \
--dropout 0.2 \
--output_emb_size 256 \
--dup_rate 0.1 \
--rdrop_coef 0.1 \
--train_set_file "./data/train_aug.csv"
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import time
from functools import partial
import numpy as np
import paddle
from data import (
convert_example,
create_dataloader,
read_simcse_text,
read_text_pair,
word_repetition,
)
from model import SimCSE
from scipy import stats
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer, LinearDecayWithWarmup
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=0, type=int, help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--epochs", default=1, type=int, help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proportion over the training process.")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--save_steps', type=int, default=10000, help="Step interval for saving checkpoint.")
parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override ecpochs.")
parser.add_argument('--eval_steps', type=int, default=10000, help="Step interval for evaluation.")
parser.add_argument("--train_set_file", type=str, required=True, help="The full path of train_set_file.")
parser.add_argument("--margin", default=0.0, type=float, help="Margin between pos_sample and neg_samples.")
parser.add_argument("--scale", default=20, type=int, help="Scale for pair-wise margin_rank_loss.")
parser.add_argument("--is_unsupervised", action='store_true', help="Whether to use unsupervised training")
parser.add_argument("--dropout", default=0.1, type=float, help="Dropout for pretrained model encoder.")
parser.add_argument("--dup_rate", default=0.32, type=float, help="duplicate rate for word repetition.")
parser.add_argument("--infer_with_fc_pooler", action='store_true', help="Whether use fc layer after cls embedding or not for when infer.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--rdrop_coef", default=0.0, type=float, help="The coefficient of KL-Divergence loss in R-Drop paper, for more detail please refer to https://arxiv.org/abs/2106.14448), if rdrop_coef > 0 then R-Drop works")
args = parser.parse_args()
def set_seed(seed):
"""sets random seed"""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
def do_evaluate(model, tokenizer, data_loader, with_pooler=False):
model.eval()
total_num = 0
spearman_corr = 0.0
sims = []
labels = []
for batch in data_loader:
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids, label = batch
total_num += len(label)
query_cls_embedding = model.get_pooled_embedding(
query_input_ids, query_token_type_ids, with_pooler=with_pooler)
title_cls_embedding = model.get_pooled_embedding(title_input_ids, title_token_type_ids, with_pooler=with_pooler)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
sims.append(cosine_sim.numpy())
labels.append(label.numpy())
sims = np.concatenate(sims, axis=0)
labels = np.concatenate(labels, axis=0)
spearman_corr = stats.spearmanr(labels, sims).correlation
model.train()
return spearman_corr, total_num
def do_train():
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
if args.is_unsupervised:
train_ds = load_dataset(read_simcse_text, data_path=args.train_set_file, is_test=False, lazy=False)
else:
train_ds = load_dataset(read_text_pair, data_path=args.train_set_file, is_test=False, lazy=False)
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path, hidden_dropout_prob=args.dropout, attention_probs_dropout_prob=args.dropout)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(
convert_example,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length)
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # query_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # query_segment
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # title_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # title_segment
),
):
return [data for data in fn(samples)]
train_data_loader = create_dataloader(
train_ds,
mode='train',
batch_size=args.batch_size,
batchify_fn=batchify_fn,
trans_fn=trans_func)
model = SimCSE(
pretrained_model,
margin=args.margin,
scale=args.scale,
output_emb_size=args.output_emb_size)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
print("warmup from:{}".format(args.init_from_ckpt))
model = paddle.DataParallel(model)
num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_data_loader) * args.epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params)
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
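# With probability 0.2, reuse the query as the title so the example becomes an identical positive pair (SimCSE-style).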
if random.random() < 0.2:
title_input_ids, title_token_type_ids = query_input_ids, query_token_type_ids
query_input_ids, query_token_type_ids = word_repetition(query_input_ids, query_token_type_ids, args.dup_rate)
title_input_ids, title_token_type_ids = word_repetition(title_input_ids, title_token_type_ids, args.dup_rate)
loss, kl_loss = model(
query_input_ids=query_input_ids,
title_input_ids=title_input_ids,
query_token_type_ids=query_token_type_ids,
title_token_type_ids=title_token_type_ids)
loss = loss + kl_loss * args.rdrop_coef
global_step += 1
if global_step % 10 == 0 and rank == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss,
10 / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
if global_step % args.save_steps == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, 'model_state.pdparams')
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
if args.max_steps > 0 and global_step >= args.max_steps:
return
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, 'model_state.pdparams')
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
if __name__ == "__main__":
do_train()
# Retrieval-Based FAQ System for Government Q&A
**Table of Contents**
* [1. Scenario Overview](#scenario-overview)
* [2. System Highlights](#system-highlights)
* [3. Government Q&A System Design](#system-design)
* [4. Hands-On Practice: Build Your Own End-to-End Retrieval-Based Q&A System](#hands-on-practice)
<a name="scenario-overview"></a>
## 1. Scenario Overview
Government staff spend a lot of time and effort on tasks such as policy interpretation. Inside government agencies, staff have usually accumulated many question-answer pairs, but they often do not know how to build a question-answering system on top of them to improve daily work efficiency and simplify workflows.
<a name="系统特色"></a>
## 2. 系统特色
+ 低门槛
+ 手把手搭建检索式 FAQ System
+ 无需相似 Query-Query Pair 标注数据也能构建 FAQ System
+ 效果好
+ 业界领先的检索预训练模型: RocketQA Dual Encoder
+ 针对无标注数据场景的领先解决方案: 检索预训练模型 + 增强的无监督语义索引微调
+ 性能快
+ 基于 Paddle Inference 快速抽取向量
+ 基于 Milvus 快速查询和高性能建库
+ 基于 Paddle Serving 高性能部署
<a name="政务问答系统方案"></a>
## 3. 政务问答系统方案
### 3.1 技术方案和评估指标
#### 3.1.1 技术方案
**语义索引**:针对政务问答只有问答对的场景,我们提供了一个 融合SimCSE 和 WR (word reptition)策略的无监督的解决方案。
#### 3.1.2 评估指标
* 该政务问答系统使用的指标是 Recall@K,表示的是预测的前topK(从最后的按得分排序的召回列表中返回前K个结果)结果和语料库中真实的前 K 个相关结果的重叠率,衡量的是检索系统的查全率。
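To make the metric concrete, here is a small sketch (toy relevance flags, not real evaluation data) of how Recall@K is computed; evaluate.py's `recall` function works the same way:
```
import numpy as np

# One row per query: 0/1 flags over its top-10 recalled docs, ordered by score
# (1 means the ground-truth doc was recalled at that rank).
recalled_flags = np.array([
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # hit at rank 1
    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],  # hit at rank 3
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # miss
])

def recall_at_k(flags, k):
    # A query counts as recalled if any of its top-k flags is 1.
    return np.mean(np.sum(flags[:, :k], axis=1) > 0)

print(recall_at_k(recalled_flags, 1))   # 0.333...
print(recall_at_k(recalled_flags, 10))  # 0.666...
```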
### 3.2 Data
The dataset comes from a COVID-19 government Q&A competition and includes source texts, questions, and answers.
| Stage | Model | Training Set | Evaluation Set (for model evaluation) | Recall Corpus |
| ------------ | ------------ |------------ | ------------ | ------------ |
| Recall | SimCSE | 4000 | 1000 | 5000 |
The question pairs in the evaluation set were constructed by Chinese-English back-translation, for a total of 1,000 examples: 500 were produced with the Baidu Translate API (see [Baidu Translate](https://fanyi-api.baidu.com/?fr=simultaneous)) and the other 500 are paraphrases generated by the SimBERT model.
```
├── data # dataset
├── train.csv # unsupervised training set
├── test_pair.csv # test set, used to evaluate model quality
├── corpus.csv # recall corpus, used to evaluate the model's recall quality
├── qa_pair.csv # question-answer pairs mapping each question to its answer
```
Download link for the dataset: [faq_data](https://paddlenlp.bj.bcebos.com/applications/faq_data.zip)
### 3.3 Code Layout
```
|—— data.py # data loading, conversion, and other preprocessing logic
|—— model.py # SimCSE model
|—— train.py # main SimCSE training script
|—— ann_util.py # ANN index-building helpers
|—— config.py # Milvus configuration file
|—— evaluate.py # recall evaluation script
|—— recall.py # recalls texts similar to a given text from the corpus, using the trained semantic-index model
|—— export_model.py # converts the dynamic graph to a static graph
|—— export_to_serving.py # converts the static graph to the Serving format
|—— feature_extract.py # batch extraction of text feature vectors
|—— milvus_util.py # Milvus insert and recall classes
|—— vector_insert.py # inserts vectors into the Milvus engine
|—— run_system.py # Client-Server mode client: sends text to the server, receives the vector, then searches with the Milvus engine
|—— scripts
|—— export_model.sh # bash script for dynamic-to-static conversion
|—— evaluate.sh # bash version of evaluation
|—— run_build_index.sh # bash version of index building
|—— train.sh # bash version of training
|—— feature_extract.sh # bash version of vector extraction
|—— export_to_serving.sh # bash script for Paddle Inference to Serving conversion
|—— deploy
|—— python
|—— rpc_client.py # Paddle Serving client
|—— web_service.py # Paddle Serving server
|—— config_nlp.yml # Paddle Serving configuration file
```
### 3.4 Evaluation Results
| Model | Recall@1 |Recall@10 |
| ------------ | ------------ |--------- |
| ERNIE1.0 + SimCSE | 68.068 | 85.686|
| RocketQA | 81.381 | 96.997|
| RocketQA + SimCSE | 83.283 | 97.297|
| RocketQA + SimCSE + WR | **83.584** | **97.497**|
<a name="动手实践——搭建自己的端到端检索式问答系统"></a>
## 4. 动手实践——搭建自己的端到端检索式问答系统
### 4.1 环境安装
在运行下面的代码之前,安装相关的依赖,运行下面的命令:
```
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 4.2 无监督训练
```
python -u -m paddle.distributed.launch --gpus '0' \
train.py \
--device gpu \
--save_dir ./checkpoints/ \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--save_steps 50 \
--max_seq_length 64 \
--dropout 0.2 \
--output_emb_size 256 \
--dup_rate 0.3 \
--train_set_file "./data/train.csv"
```
Parameter descriptions:
* `device`: train on cpu or gpu
* `save_dir`: directory for saving model checkpoints
* `batch_size`: training batch size
* `learning_rate`: training learning rate
* `epochs`: number of training epochs
* `save_steps`: number of steps between checkpoint saves
* `max_seq_length`: maximum input sequence length
* `dropout`: dropout rate used by SimCSE
* `output_emb_size`: dimension of the text embedding produced on top of the Transformer
* `dup_rate`: repetition rate of SimCSE's word repetition (WR) strategy, sketched below
* `train_set_file`: training set file
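The WR strategy simply repeats a few randomly chosen tokens so the two views of a sentence differ at the token level. A minimal sketch on a toy token-id sequence (the real implementation is `word_repetition` in data.py, which operates on padded batches):
```
import random

def word_repetition_toy(token_ids, dup_rate=0.3):
    # Choose how many tokens to duplicate, then pick positions,
    # skipping the [CLS]/[SEP] positions at the two ends.
    dup_len = random.randint(0, max(2, int(dup_rate * len(token_ids))))
    dup_pos = set(random.sample(range(1, len(token_ids) - 1), k=dup_len))
    out = []
    for i, tid in enumerate(token_ids):
        out.append(tid)
        if i in dup_pos:
            out.append(tid)  # repeat this token once
    return out

print(word_repetition_toy([101, 11, 12, 13, 14, 102]))  # e.g. [101, 11, 11, 12, 13, 14, 102]
```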
Alternatively, use the bash script:
```
sh scripts/train.sh
```
### 4.3 Evaluation
Evaluation consists of four steps:
a. Extract the Doc-side embeddings
Use the semantic-index model to extract vectors for the texts in the Doc corpus.
b. Build an index over the Doc embeddings with hnswlib
Use an ANN engine to build the index (here we use [hnswlib](https://github.com/nmslib/hnswlib) for ANN indexing; see the sketch after step d).
c. Extract question embeddings and query for similar results
Use the semantic-index model to extract vectors for the evaluation set's *Source Text*, run ANN queries against the index built in step b, recall the Top 10 most similar *Target Text*, and write the recall results for each *Source Text* to the `recall_result` file.
d. Evaluate
Compute Recall@K (K = 1, 5, 10) from the evaluation set `test_pair.csv` and the recall results in `recall_result`.
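A minimal hnswlib sketch using the same settings as ann_util.py (inner-product space, 256-dimensional embeddings; the random vectors below stand in for real doc embeddings):
```
import hnswlib
import numpy as np

dim = 256
doc_embeddings = np.random.rand(1000, dim).astype("float32")

index = hnswlib.Index(space="ip", dim=dim)  # inner-product space
index.init_index(max_elements=1000000, ef_construction=100, M=100)
index.set_ef(100)  # higher ef: better recall, slower search
index.add_items(doc_embeddings)

query = np.random.rand(1, dim).astype("float32")
labels, distances = index.knn_query(query, k=10)
# hnswlib reports 1 - inner_product as the distance, which is why
# recall.py writes 1.0 - distance as the similarity score.
```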
Run the following command to build the ANN index, run recall, and produce the recall results in `recall_result`:
```
python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--params_path "checkpoints/model_150/model_state.pdparams" \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
--output_emb_size 256\
--max_seq_length 64 \
--recall_num 10 \
--similar_text_pair "data/test_pair.csv" \
--corpus_file "data/corpus.csv"
```
Parameter descriptions:
* `device`: run on cpu or gpu
* `recall_result_dir`: directory for storing recall results
* `recall_result_file`: file name of the recall results
* `params_path`: parameter file of the model to evaluate
* `hnsw_m`: hnsw algorithm parameter; the default usually works
* `hnsw_ef`: hnsw algorithm parameter; the default usually works
* `output_emb_size`: dimension of the text embedding produced on top of the Transformer
* `recall_num`: number of similar texts recalled per query text
* `similar_text_pair`: evaluation set made of similar text pairs
* `corpus_file`: recall corpus file
Alternatively, use the bash script:
```
sh scripts/run_build_index.sh
```
run_build_index.sh contains both CPU and GPU variants; the GPU variant is the default.
Next, run the following command to evaluate and produce the Recall@1, Recall@5, and Recall@10 metrics:
```
python -u evaluate.py \
--similar_text_pair "data/test_pair.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 10
```
Alternatively, use the bash script:
```
sh scripts/evaluate.sh
```
The output looks like this:
```
recall@1=83.784
recall@5=94.995
recall@10=96.997
```
Parameter descriptions:
* `similar_text_pair`: evaluation set made of similar text pairs, e.g. semantic_similar_pair.tsv
* `recall_result_file`: recall results for the first column (*Source Text*) of the evaluation set
* `recall_num`: number of similar texts recalled per query text
### 4.4 Model Deployment
For deployment, the dynamic graph model is first converted to a static graph and then to the Serving format.
#### Dynamic-to-Static Export
First convert the dynamic graph model to a static graph:
```
python export_model.py --params_path checkpoints/model_150/model_state.pdparams --output_path=./output
```
Alternatively, run the bash script:
```
sh scripts/export_model.sh
```
#### Q&A Retrieval Engine
With the model ready, set up the Milvus semantic retrieval engine for fast vector search. This project uses the open-source [Milvus](https://milvus.io/) tool; for setup, follow the official guide [Milvus installation](https://milvus.io/cn/docs/v1.1.1/milvus_docker-cpu.md). This project uses the Milvus 1.1.1 CPU version, and the official Docker installation is recommended for simplicity.
Once Milvus is up, you can insert and search vectors. First generate the embeddings, one 256-dimensional vector per sample:
```
python feature_extract.py \
--model_dir=./output \
--corpus_file "data/corpus.csv"
```
The output directory holds the recall model's Paddle Inference static graph.
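feature_extract.py saves the embeddings to `corpus_embedding.npy` (via `np.save`); a quick sanity check:
```
import numpy as np

emb = np.load("corpus_embedding.npy")
print(emb.shape)  # expected: (number of corpus texts, 256)
```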
Then insert the vectors into the Milvus system you set up:
```
python vector_insert.py
```
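vector_insert.py itself is not shown in this commit; below is a minimal sketch of what it needs to do, assuming the `VecToMilvus` helper from milvus_util.py and the `corpus_embedding.npy` produced above (the collection and partition names are illustrative):
```
import numpy as np

from milvus_util import VecToMilvus

embeddings = np.load("corpus_embedding.npy")
client = VecToMilvus()
# Insert in batches; use the row offsets in corpus.csv as the vector ids.
batch_size = 10000
for start in range(0, len(embeddings), batch_size):
    batch = embeddings[start:start + batch_size].tolist()
    ids = list(range(start, start + len(batch)))
    client.insert(vectors=batch, collection_name="faq_system", ids=ids, partition_tag="partition_1")
```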
#### Paddle Serving Deployment
For installing Paddle Serving, see the [Paddle Serving installation docs](https://github.com/PaddlePaddle/Serving#installation). Install the dependencies on both the server and the client, then proceed with the steps below.
First export the generated static graph model to the Paddle Serving format:
```
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "./serving_server" \
--client_path "./serving_client" \
--fetch_alias_names "output_embedding"
```
Parameter descriptions:
* `dirname`: path of the model files to convert; both the Program structure file and the parameter files live in this directory.
* `model_filename`: file name of the Inference Program structure to convert. If None, `__model__` is used as the default file name.
* `params_filename`: file name containing all parameters of the model to convert. Only needed when all parameters are saved in a single binary file; if parameters are stored in separate files, set it to None.
* `server_path`: output path for the converted model files and server config. Defaults to serving_server.
* `client_path`: output path for the converted client config. Defaults to serving_client.
* `fetch_alias_names`: aliases for the model outputs, e.g. output_embedding can be renamed. Unset by default.
* `feed_alias_names`: aliases for the model inputs, e.g. input_ids can be renamed. Unset by default.
Alternatively, run the bash script:
```
sh scripts/export_to_serving.sh
```
Start the Pipeline Server:
```
cd deploy/python/
python web_service.py
```
Invoke the server from a client over HTTP POST. An example request:
```
curl -X POST -k http://localhost:8090/ernie/prediction -d '{"key": ["0"], "value": ["宁夏针对哪些人员开通工伤保障绿色通道?"]}'
```
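Equivalently, a small Python sketch of the same POST call using the requests library:
```
import json

import requests

url = "http://localhost:8090/ernie/prediction"
data = {"key": ["0"], "value": ["宁夏针对哪些人员开通工伤保障绿色通道?"]}
resp = requests.post(url, data=json.dumps(data))
print(resp.json())  # the response value carries the stringified output_embedding
```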
You can also use RPC. First edit the samples to be predicted in rpc_client.py:
```
list_data = [
"湖北省为什么鼓励缴费人通过线上缴费渠道缴费?",
"佛山市救助站有多少个救助床位"
]
```
Then run:
```
python rpc_client.py
```
### 4.5 End-to-End Q&A Flow
The Q&A system uses a Client-Server architecture: the embedding model is deployed on the server, and a client is started to query it.
```
python run_system.py
```
The built-in test example is:
```
list_data = ["嘉定区南翔镇实行双门长制“门长”要求落实好哪些工作?"]
```
It produces output like the following:
```
......
Extract feature time to cost :0.01161503791809082 seconds
Search milvus time cost is 0.004535675048828125 seconds
嘉定区南翔镇实行双门长制“门长”要求落实好哪些工作? 拦、查、问、测、记 1.2107588152551751e-12
上海市黄浦区老西门街道建立的党建责任区包干机制内容是什么? 街道工作人员担任楼宇联络员,分片区对接商务楼宇所属的物业公司,引导楼宇企业共同落实严防严控任务 0.4956303834915161
上海市街道执行“四个统一”具体指什么? 统一由居委会干部在统一时间(每周三、五下午),递交至统一地点(社区事务受理服务中心专设窗口),街道统一收集至後台 0.6684658527374268
怀柔区城管委在加强监督检查方面是如何落实的? 严格落实四方责任,保证每周2~3次深入环卫、电、气、热、公共自行车、垃圾处置等单位进行巡查,督促企业做好防疫工作,协调复工复产中存在的问题,确保安全复工复产有效落实。 0.7147952318191528
华新镇“亮牌分批复工”工作方案具体内容是什么? 所有店铺一律先贴“红牌”禁止经营,经相关部门审批後,再换贴“蓝牌”准许复工。 0.7162970900535583
.....
```
The output includes the feature-extraction and search times, followed by the retrieved question-answer pairs.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hnswlib
import numpy as np
from paddlenlp.utils.log import logger
def build_index(args, data_loader, model):
index = hnswlib.Index(space="ip", dim=args.output_emb_size if args.output_emb_size > 0 else 768)
# Initializing index
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
index.init_index(max_elements=args.hnsw_max_elements, ef_construction=args.hnsw_ef, M=args.hnsw_m)
# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
index.set_ef(args.hnsw_ef)
# Set number of threads used during batch search/construction
# By default using all available cores
index.set_num_threads(16)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from milvus import IndexType, MetricType
MILVUS_HOST = "10.21.226.173"
MILVUS_PORT = 8530
collection_param = {"dimension": 256, "index_file_size": 256, "metric_type": MetricType.L2}
index_type = IndexType.IVF_FLAT
index_param = {"nlist": 1000}
top_k = 100
search_param = {"nprobe": 20}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
import paddle
def gen_id2corpus(corpus_file):
id2corpus = {}
with open(corpus_file, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.rstrip()
return id2corpus
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = True if mode == "train" else False
if mode == "train":
batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
def convert_example(example, tokenizer, max_seq_length=512, do_evaluate=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`list(str)`): The list of text to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
do_evaluate(obj:`bool`, defaults to `False`): Whether the example contains a label or not.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
if "label" in key:
# do_evaluate
result += [example["label"]]
else:
# do_train
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def convert_example_test(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`list(str)`): The list of text to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad the sequence to max_seq_len.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def read_simcse_text(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip()
yield {"text_a": data, "text_b": data}
def read_text_pair(data_path, is_test=False):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if is_test is False:
if len(data) != 3:
continue
yield {"text_a": data[0], "text_b": data[1], "label": data[2]}
else:
if len(data) != 2:
continue
yield {"text_a": data[0], "text_b": data[1]}
def gen_text_file(similar_text_pair_file):
text2similar_text = {}
texts = []
with open(similar_text_pair_file, "r", encoding="utf-8") as f:
for line in f:
splited_line = line.rstrip().split("\t")
if len(splited_line) != 2:
continue
text, similar_text = line.rstrip().split("\t")
if not text or not similar_text:
continue
text2similar_text[text] = similar_text
texts.append({"text": text})
return texts, text2similar_text
def word_repetition(input_ids, token_type_ids, dup_rate=0.32):
"""Word Repetition strategy."""
input_ids = input_ids.numpy().tolist()
token_type_ids = token_type_ids.numpy().tolist()
batch_size, seq_len = len(input_ids), len(input_ids[0])
repeated_input_ids = []
repeated_token_type_ids = []
rep_seq_len = seq_len
for batch_id in range(batch_size):
cur_input_id = input_ids[batch_id]
actual_len = np.count_nonzero(cur_input_id)
dup_word_index = []
# Only apply repetition to sequences with more than 5 non-pad tokens
if actual_len > 5:
dup_len = random.randint(a=0, b=max(2, int(dup_rate * actual_len)))
# Skip the cls and sep positions
dup_word_index = random.sample(list(range(1, actual_len - 1)), k=dup_len)
r_input_id = []
r_token_type_id = []
for idx, word_id in enumerate(cur_input_id):
# Insert the duplicated word
if idx in dup_word_index:
r_input_id.append(word_id)
r_token_type_id.append(token_type_ids[batch_id][idx])
r_input_id.append(word_id)
r_token_type_id.append(token_type_ids[batch_id][idx])
after_dup_len = len(r_input_id)
repeated_input_ids.append(r_input_id)
repeated_token_type_ids.append(r_token_type_id)
if after_dup_len > rep_seq_len:
rep_seq_len = after_dup_len
# Pad all sequences in the batch to the same length
for batch_id in range(batch_size):
after_dup_len = len(repeated_input_ids[batch_id])
pad_len = rep_seq_len - after_dup_len
repeated_input_ids[batch_id] += [0] * pad_len
repeated_token_type_ids[batch_id] += [0] * pad_len
return paddle.to_tensor(repeated_input_ids, dtype="int64"), paddle.to_tensor(
repeated_token_type_ids, dtype="int64"
)
# worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each with its own gRPC server and DAG.
# When build_dag_each_worker=False, the framework sets max_workers=worker_num for the main thread's gRPC thread pool.
worker_num: 20
# build_dag_each_worker: False builds one DAG inside the process; True builds an independent DAG in each of several processes.
build_dag_each_worker: false
dag:
# Op resource type: True for the thread model, False for the process model
is_thread_op: False
# Profiling: True generates Timeline performance data (with some overhead); False disables it
tracer:
interval_s: 10
# HTTP port. rpc_port and http_port must not both be empty. When rpc_port is set and http_port is empty, no http_port is generated automatically.
http_port: 8090
# RPC port. rpc_port and http_port must not both be empty. When rpc_port is empty and http_port is set, rpc_port is automatically set to http_port+1.
rpc_port: 8080
op:
ernie:
# Concurrency: thread-level when is_thread_op=True, otherwise process-level
concurrency: 1
# When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf:
# Client type: brpc, grpc, or local_predictor. local_predictor predicts in-process without starting a Serving service.
client_type: local_predictor
# ir_optim
ir_optim: True
# device_type: 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 1
# Compute device IDs: empty or unset means CPU prediction; "0" or "0,1,2" means GPU prediction on the listed cards
devices: '2'
# Fetch list, keyed by the alias_name of fetch_var in client_config; if unset, everything is returned
fetch_list: ['output_embedding']
# Model path
model_config: ../../serving_server/
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_server.pipeline import PipelineClient
client = PipelineClient()
client.connect(["127.0.0.1:8080"])
list_data = ["湖北省为什么鼓励缴费人通过线上缴费渠道缴费?", "佛山市救助站有多少个救助床位"]
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
print(feed)
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
print(ret.key)
print(result.shape)
print(result)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle_serving_server.web_service import Op, WebService
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
result = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class ErnieOp(Op):
def init_op(self):
from paddlenlp.transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
def preprocess(self, input_dicts, data_id, log_id):
from paddlenlp.data import Pad, Tuple
((_, input_dict),) = input_dicts.items()
print("input dict", input_dict)
batch_size = len(input_dict.keys())
examples = []
for i in range(batch_size):
input_ids, segment_ids = convert_example([input_dict[str(i)]], self.tokenizer)
examples.append((input_ids, segment_ids))
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # segment
),
):
return fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict["input_ids"] = input_ids
feed_dict["token_type_ids"] = segment_ids
return feed_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
new_dict = {}
new_dict["output_embedding"] = str(fetch_dict["output_embedding"].tolist())
return new_dict, None, ""
class ErnieService(WebService):
def get_pipeline_response(self, read_op):
ernie_op = ErnieOp(name="ernie", input_ops=[read_op])
return ernie_op
ernie_service = ErnieService(name="ernie")
ernie_service.prepare_pipeline_config("config_nlp.yml")
ernie_service.run_service()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--similar_text_pair", type=str, default='', help="The full path of similat pair file")
parser.add_argument("--recall_result_file", type=str, default='', help="The full path of recall result file")
parser.add_argument("--recall_num", type=int, default=10, help="Most similair number of doc recalled from corpus per query")
args = parser.parse_args()
# yapf: enable
def recall(rs, N=10):
"""
Ratio of queries whose ground truth appears in the top-N recalled docs.
>>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
>>> recall(rs, N=1)
0.333333
>>> recall(rs, N=2)
0.6666667
>>> recall(rs, N=3)
1.0
Args:
rs: Iterator of per-query recalled relevance flags
Returns:
Recall@N
"""
recall_flags = [np.sum(r[0:N]) for r in rs]
return np.mean(recall_flags)
if __name__ == "__main__":
text2similar = {}
with open(args.similar_text_pair, "r", encoding="utf-8") as f:
for line in f:
text, similar_text = line.rstrip().split("\t")
text2similar[text] = similar_text
rs = []
with open(args.recall_result_file, "r", encoding="utf-8") as f:
relevance_labels = []
for index, line in enumerate(f):
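# Every args.recall_num consecutive lines hold the recalled candidates for one query; flush the previous group when a new query starts.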
if index % args.recall_num == 0 and index != 0:
rs.append(relevance_labels)
relevance_labels = []
text, recalled_text, cosine_sim = line.rstrip().split("\t")
if text2similar[text] == recalled_text:
relevance_labels.append(1)
else:
relevance_labels.append(0)
# Flush the final group, which the loop above never appends
rs.append(relevance_labels)
recall_N = []
recall_num = [1, 5, 10]
result = open("result.tsv", "a")
res = []
for topN in recall_num:
R = round(100 * recall(rs, N=topN), 3)
recall_N.append(str(R))
for key, val in zip(recall_num, recall_N):
print("recall@{}={}".format(key, val))
res.append(str(val))
result.write("\t".join(res) + "\n")
# print("\t".join(recall_N))
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SimCSE
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
default='./checkpoint/model_50/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output',
help="The path of model parameter in static graph to be saved.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
# To use a different pretrained model (e.g. ERNIE 1.0), replace the model name below
output_emb_size = 256
pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh")
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
model = SimCSE(pretrained_model, output_emb_size=output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # segment_ids
],
)
# Save in static graph model.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle_serving_client.io as serving_io
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--dirname", type=str, required=True,
default='./output', help="Path of saved model files. Program file and parameter files are saved in this directory.")
parser.add_argument("--model_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdmodel', help="The name of file to load the inference program. If it is None, the default filename __model__ will be used.")
parser.add_argument("--params_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdiparams', help="The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.")
parser.add_argument("--server_path", type=str, default='./serving_server',
help="The path of server parameter in static graph to be saved.")
parser.add_argument("--client_path", type=str, default='./serving_client',
help="The path of client parameter in static graph to be saved.")
parser.add_argument("--feed_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of feed vars')
parser.add_argument("--fetch_alias_names", type=str, default=None,
help='set alias names for fetch vars, split by comma \',\', you should run --show_proto to check the number of fetch vars')
parser.add_argument("--show_proto", type=bool, default=False,
help='If yes, you can preview the proto and then determine your feed var alias name and fetch var alias name.')
# yapf: enable
if __name__ == "__main__":
args = parser.parse_args()
serving_io.inference_model_to_serving(
dirname=args.dirname,
serving_server=args.server_path,
serving_client=args.client_path,
model_filename=args.model_filename,
params_filename=args.params_filename,
show_proto=args.show_proto,
feed_alias_names=args.feed_alias_names,
fetch_alias_names=args.fetch_alias_names,
)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import numpy as np
import paddle
from paddle import inference
from tqdm import tqdm
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory to static model.")
parser.add_argument("--corpus_file", type=str, required=True, help="The corpus_file path.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`list(str)`): The list of text to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad the sequence to max_seq_len.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class Predictor(object):
def __init__(
self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False,
):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.get_pooled_embedding.pdmodel"
params_file = model_dir + "/inference.get_pooled_embedding.pdiparams"
if not os.path.exists(model_file):
raise ValueError("not find model file path {}".format(model_file))
if not os.path.exists(params_file):
raise ValueError("not find params file path {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initialize the gpu memory, enable tensorrt
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size, min_subgraph_size=30, precision_mode=precision_mode
)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
def predict(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data whose each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the predictions labels.
"""
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # segment
),
):
return fn(samples)
all_embeddings = []
examples = []
for idx, text in enumerate(tqdm(data)):
input_ids, segment_ids = convert_example(
text, tokenizer, max_seq_length=self.max_seq_length, pad_to_max_seq_len=True
)
examples.append((input_ids, segment_ids))
if len(examples) >= self.batch_size:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
examples = []
if len(examples) > 0:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
all_embeddings = np.concatenate(all_embeddings, axis=0)
np.save("corpus_embedding", all_embeddings)
def read_text(file_path):
id2corpus = {}
with open(file_path, "r", encoding="utf-8") as f:
for idx, data in enumerate(f):
id2corpus[idx] = data.strip()
return id2corpus
if __name__ == "__main__":
predictor = Predictor(
args.model_dir,
args.device,
args.max_seq_length,
args.batch_size,
args.use_tensorrt,
args.precision,
args.cpu_threads,
args.enable_mkldnn,
)
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
id2corpus = read_text(args.corpus_file)
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
predictor.predict(corpus_list, tokenizer)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from config import (
MILVUS_HOST,
MILVUS_PORT,
collection_param,
index_param,
index_type,
search_param,
top_k,
)
# from milvus import *
from milvus import Milvus
class VecToMilvus:
def __init__(self):
self.client = Milvus(host=MILVUS_HOST, port=MILVUS_PORT)
def has_collection(self, collection_name):
try:
status, ok = self.client.has_collection(collection_name)
return ok
except Exception as e:
print("Milvus has_table error:", e)
def create_collection(self, collection_name):
try:
collection_param["collection_name"] = collection_name
status = self.client.create_collection(collection_param)
print(status)
return status
except Exception as e:
print("Milvus create collection error:", e)
def create_index(self, collection_name):
try:
status = self.client.create_index(collection_name, index_type, index_param)
print(status)
return status
except Exception as e:
print("Milvus create index error:", e)
def has_partition(self, collection_name, partition_tag):
try:
status, ok = self.client.has_partition(collection_name, partition_tag)
return ok
except Exception as e:
print("Milvus has partition error: ", e)
def create_partition(self, collection_name, partition_tag):
try:
status = self.client.create_partition(collection_name, partition_tag)
print("create partition {} successfully".format(partition_tag))
return status
except Exception as e:
print("Milvus create partition error: ", e)
def insert(self, vectors, collection_name, ids=None, partition_tag=None):
try:
if not self.has_collection(collection_name):
self.create_collection(collection_name)
self.create_index(collection_name)
print("collection info: {}".format(self.client.get_collection_info(collection_name)[1]))
if (partition_tag is not None) and (not self.has_partition(collection_name, partition_tag)):
self.create_partition(collection_name, partition_tag)
status, ids = self.client.insert(
collection_name=collection_name, records=vectors, ids=ids, partition_tag=partition_tag
)
self.client.flush([collection_name])
print(
"Insert {} entities, there are {} entities after insert data.".format(
len(ids), self.client.count_entities(collection_name)[1]
)
)
return status, ids
except Exception as e:
print("Milvus insert error:", e)
class RecallByMilvus:
def __init__(self):
self.client = Milvus(host=MILVUS_HOST, port=MILVUS_PORT)
def search(self, vectors, collection_name, partition_tag=None):
try:
status, results = self.client.search(
collection_name=collection_name,
query_records=vectors,
top_k=top_k,
params=search_param,
partition_tag=partition_tag,
)
return status, results
except Exception as e:
print("Milvus recall error: ", e)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class SimCSE(nn.Layer):
def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is greater than 0, then add Linear layer to reduce embedding_size,
# we recommend set output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
self.output_emb_size = output_emb_size
# Guard against the default output_emb_size=None as well as 0
if output_emb_size is not None and output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(768, output_emb_size, weight_attr=weight_attr)
self.margin = margin
# Scale cosine similarity to ease convergence
self.scale = scale
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def get_pooled_embedding(
self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, with_pooler=True
):
# Note: cls_embedding is the pooled embedding with tanh activation
sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if with_pooler is False:
cls_embedding = sequence_output[:, 0, :]
if self.output_emb_size is not None and self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
with_pooler=True,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask, with_pooler=with_pooler
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask, with_pooler=with_pooler
)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
def forward(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)
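# In-batch negatives: entry (i, j) is the similarity between query i and title j;
# the diagonal holds the positive pairs and off-diagonal entries serve as negatives.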
# Subtract margin from all positive samples' cosine similarity (the diagonal)
margin_diag = paddle.full(
shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()
)
cosine_sim = cosine_sim - paddle.diag(margin_diag)
# Scale cosine similarity to ease training convergence
cosine_sim *= self.scale
labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
loss = F.cross_entropy(input=cosine_sim, label=labels)
return loss
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from functools import partial
import paddle
from ann_util import build_index
from data import convert_example_test, create_dataloader, gen_id2corpus, gen_text_file
from model import SimCSE
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
from paddlenlp.utils.log import logger
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str, required=True, help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str, required=True, help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='recall_result', help="The full path of recall result file to save")
parser.add_argument("--recall_result_file", type=str, default='recall_result_file', help="The file name of recall result")
parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=None, type=int, help="output_embedding_size")
parser.add_argument("--recall_num", default=10, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_ef", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
model_name_or_path = "rocketqa-zh-dureader-query-encoder"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
trans_func = partial(convert_example_test, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
),
):
return [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)
# Load pretrained semantic model
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
logger.info("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
id2corpus = gen_id2corpus(args.corpus_file)
# convert_example_test's input must be a dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
# Need better way to get inner model of DataParallel
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
# print(text_list[:5])
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(
query_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
if not os.path.exists(args.recall_result_dir):
os.mkdir(args.recall_result_dir)
recall_result_file = os.path.join(args.recall_result_dir, args.recall_result_file)
with open(recall_result_file, "w", encoding="utf-8") as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
recalled_idx, cosine_sims = final_index.knn_query(batch_query_embedding.numpy(), args.recall_num)
batch_size = len(cosine_sims)
for row_index in range(batch_size):
text_index = args.batch_size * batch_index + row_index
for idx, doc_idx in enumerate(recalled_idx[row_index]):
f.write(
"{}\t{}\t{}\n".format(
text_list[text_index]["text"], id2corpus[doc_idx], 1.0 - cosine_sims[row_index][idx]
)
)
pymilvus==1.1.1
pandas==0.25.1
paddlenlp>=2.3.7
hnswlib>=0.5.2
pybind11