Commit 10f294ff authored by yuguo-Jack

llama_paddle

parent 7c64e6ec
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=UTF-8
import numpy as np
import hnswlib
from paddlenlp.utils.log import logger
def build_index(args, data_loader, model):
index = hnswlib.Index(space="ip", dim=args.output_emb_size)
# Initializing index
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
index.init_index(max_elements=args.hnsw_max_elements, ef_construction=args.hnsw_ef, M=args.hnsw_m)
# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
index.set_ef(args.hnsw_ef)
# Set number of threads used during batch search/construction
# By default using all available cores
index.set_num_threads(16)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index
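# Usage sketch (an illustration, not part of the original file): querying the
# index returned by build_index, assuming `query_embeddings` is a float32
# numpy array of shape [n, args.output_emb_size]:
#
#   recalled_idx, distances = index.knn_query(query_embeddings, k=10)
#   # hnswlib's "ip" space reports distance = 1 - inner_product, so for
#   # L2-normalized embeddings the cosine similarity is 1.0 - distance.
#   similarities = 1.0 - distances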
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = mode == "train"
if mode == "train":
batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
def convert_example_test(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): A dict mapping keys to raw texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad every sequence to `max_seq_length`.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def convert_example(example, tokenizer, max_seq_length=512, do_evaluate=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): A dict of raw texts (and optionally a label) to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
do_evaluate(obj:`bool`, defaults to `False`): Whether the example contains a label.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
if "label" in key:
# do_evaluate
result += [example["label"]]
else:
# do_train
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def gen_id2corpus(corpus_file):
id2corpus = {}
with open(corpus_file, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.rstrip()
return id2corpus
def gen_text_file(similar_text_pair_file):
text2similar_text = {}
texts = []
with open(similar_text_pair_file, "r", encoding="utf-8") as f:
for line in f:
split_line = line.rstrip().split("\t")
if len(split_line) != 2:
continue
text, similar_text = split_line
if not text or not similar_text:
continue
text2similar_text[text] = similar_text
texts.append({"text": text})
return texts, text2similar_text
def read_simcse_text(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip()
yield {"text_a": data, "text_b": data}
def read_text_pair(data_path, is_test=False):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if is_test is False:
if len(data) != 3:
continue
yield {"text_a": data[0], "text_b": data[1], "label": data[2]}
else:
if len(data) != 2:
continue
yield {"text_a": data[0], "text_b": data[1]}
python predict.py --model_dir=../../output
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import paddle
from paddle import inference
from scipy import spatial
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.utils.log import logger
sys.path.append(".")
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory to static model.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=15, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="model name.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--benchmark", type=eval, default=False, help="To log some information about environment and running.")
parser.add_argument("--save_log_path", type=str, default="./log_output/", help="The file path to save log.")
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, do_evaluate=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): A dict of raw texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
do_evaluate(obj:`bool`, defaults to `False`): Whether the example contains a label.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class Predictor(object):
def __init__(
self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False,
):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.get_pooled_embedding.pdmodel"
params_file = model_dir + "/inference.get_pooled_embedding.pdiparams"
if not os.path.exists(model_file):
raise ValueError("Model file not found at {}".format(model_file))
if not os.path.exists(params_file):
raise ValueError("Params file not found at {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initialize the gpu memory, enable tensorrt
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size, min_subgraph_size=30, precision_mode=precision_mode
)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
if args.benchmark:
import auto_log
pid = os.getpid()
self.autolog = auto_log.AutoLogger(
model_name=args.model_name_or_path,
model_precision=precision,
batch_size=self.batch_size,
data_shape="dynamic",
save_path=args.save_log_path,
inference_config=config,
pids=pid,
process_name=None,
gpu_ids=0,
time_keys=["preprocess_time", "inference_time", "postprocess_time"],
warmup=0,
logger=logger,
)
def extract_embedding(self, data, tokenizer):
"""
Extracts the feature vectors of the input texts.
Args:
data (obj:`List(str)`): The batch data, where each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`numpy.ndarray`): The feature vectors of the input texts.
"""
if args.benchmark:
self.autolog.times.start()
examples = []
for text in data:
input_ids, segment_ids = convert_example(text, tokenizer)
examples.append((input_ids, segment_ids))
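# Tuple applies one function per field of each sample: here the two Pads
# pad input_ids and token_type_ids to the longest sequence in the batch.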
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
): fn(samples)
if args.benchmark:
self.autolog.times.stamp()
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
if args.benchmark:
self.autolog.times.stamp()
if args.benchmark:
self.autolog.times.end(stamp=True)
return logits
def predict(self, data, tokenizer):
"""
Predicts the similarity of text pairs.
Args:
data (obj:`List(str)`): The batch data, where each element is a text pair.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`list`): The cosine similarity of each text pair.
"""
if args.benchmark:
self.autolog.times.start()
examples = []
for idx, text in enumerate(data):
input_ids, segment_ids = convert_example({idx: text[0]}, tokenizer)
title_ids, title_segment_ids = convert_example({idx: text[1]}, tokenizer)
examples.append((input_ids, segment_ids, title_ids, title_segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # query input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # query segment
Pad(axis=0, pad_val=tokenizer.pad_token_id), # title input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # title segment
): fn(samples)
if args.benchmark:
self.autolog.times.stamp()
query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(query_ids)
self.input_handles[1].copy_from_cpu(query_segment_ids)
self.predictor.run()
query_logits = self.output_handle.copy_to_cpu()
self.input_handles[0].copy_from_cpu(title_ids)
self.input_handles[1].copy_from_cpu(title_segment_ids)
self.predictor.run()
title_logits = self.output_handle.copy_to_cpu()
if args.benchmark:
self.autolog.times.stamp()
if args.benchmark:
self.autolog.times.end(stamp=True)
result = [float(1 - spatial.distance.cosine(arr1, arr2)) for arr1, arr2 in zip(query_logits, title_logits)]
return result
if __name__ == "__main__":
# Define predictor to do prediction.
predictor = Predictor(
args.model_dir,
args.device,
args.max_seq_length,
args.batch_size,
args.use_tensorrt,
args.precision,
args.cpu_threads,
args.enable_mkldnn,
)
# ErnieTinyTokenizer is special for the ernie-tiny pretrained model.
output_emb_size = 256
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
id2corpus = {0: "国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据"}
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
res = predictor.extract_embedding(corpus_list, tokenizer)
print(res.shape)
print(res)
corpus_list = [["中西方语言与文化的差异", "中西方文化差异以及语言体现中西方文化,差异,语言体现"], ["中西方语言与文化的差异", "飞桨致力于让深度学习技术的创新与应用更简单"]]
res = predictor.predict(corpus_list, tokenizer)
print(res)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--similar_text_pair", type=str, default='', help="The full path of similat pair file")
parser.add_argument("--recall_result_file", type=str, default='', help="The full path of recall result file")
parser.add_argument("--recall_num", type=int, default=10, help="Most similair number of doc recalled from corpus per query")
args = parser.parse_args()
# yapf: enable
def recall(rs, N=10):
"""
Ratio of recalled Ground Truth at topN Recalled Docs
>>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
>>> recall(rs, N=1)
0.333333
>>> recall(rs, N=2)
>>> 0.6666667
>>> recall(rs, N=3)
>>> 1.0
Args:
rs: Iterator of recalled flag()
Returns:
Recall@N
"""
recall_flags = [np.sum(r[0:N]) for r in rs]
return np.mean(recall_flags)
if __name__ == "__main__":
text2similar = {}
with open(args.similar_text_pair, "r", encoding="utf-8") as f:
for line in f:
text, similar_text = line.rstrip().split("\t")
text2similar[text] = similar_text
rs = []
with open(args.recall_result_file, "r", encoding="utf-8") as f:
relevance_labels = []
for index, line in enumerate(f):
if index % args.recall_num == 0 and index != 0:
rs.append(relevance_labels)
relevance_labels = []
text, recalled_text, cosine_sim = line.rstrip().split("\t")
if text2similar[text] == recalled_text:
relevance_labels.append(1)
else:
relevance_labels.append(0)
if relevance_labels:
rs.append(relevance_labels)  # keep the flags of the last query group
recall_N = []
recall_num = [1, 5, 10, 20, 50]
res = []
with open("result.tsv", "a") as result:
for topN in recall_num:
R = round(100 * recall(rs, N=topN), 3)
recall_N.append(str(R))
for key, val in zip(recall_num, recall_N):
print("recall@{}={}".format(key, val))
res.append(str(val))
result.write("\t".join(res) + "\n")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SimCSE
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output', help="The path of model parameter in static graph to be saved.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-base-query-encoder', type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
# If you want to use the ernie1.0 model, please uncomment the following code
output_emb_size = 256
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # segment_ids
],
)
# Save in static graph model.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from functools import partial
import paddle
from data import create_dataloader
from model import SimCSE
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
def convert_example(example, tokenizer, max_seq_length=512, do_evaluate=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): A dict of raw texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
do_evaluate(obj:`bool`, defaults to `False`): Whether the example contains a label.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
if __name__ == "__main__":
device = "gpu"
max_seq_length = 64
output_emb_size = 256
batch_size = 1
params_path = "checkpoints/model_20000/model_state.pdparams"
id2corpus = {0: "国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据"}
model_name_or_path = "rocketqa-zh-base-query-encoder"
paddle.set_device(device)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
): [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=output_emb_size)
# Load pretrained semantic model
if params_path and os.path.isfile(params_path):
state_dict = paddle.load(params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
# convert_example function's input must be a dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
all_embeddings = []
model.eval()
with paddle.no_grad():
for batch_data in corpus_data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = model.get_pooled_embedding(input_ids, token_type_ids)
all_embeddings.append(text_embeddings)
text_embedding = all_embeddings[0]
print(text_embedding.shape)
print(text_embedding.numpy())
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class SimCSE(nn.Layer):
def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# If output_emb_size is greater than 0, add a Linear layer to reduce the embedding size.
# We recommend setting output_emb_size = 256 considering the trade-off between
# recall performance and efficiency.
self.output_emb_size = output_emb_size
if output_emb_size is not None and output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(
self.ptm.config.hidden_size, output_emb_size, weight_attr=weight_attr
)
self.margin = margin
# Scale the cosine similarity to ease convergence
self.scale = scale
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def get_pooled_embedding(
self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, with_pooler=True
):
# Note: cls_embedding is the pooled embedding with tanh activation
sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if with_pooler is False:
cls_embedding = sequence_output[:, 0, :]
if self.output_emb_size is not None and self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
with_pooler=True,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask, with_pooler=with_pooler
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask, with_pooler=with_pooler
)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
def forward(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)
# Subtract the margin from the positive samples' cosine similarity (the diagonal)
margin_diag = paddle.full(
shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()
)
cosine_sim = cosine_sim - paddle.diag(margin_diag)
# Scale the cosine similarity to ease training convergence
cosine_sim *= self.scale
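# In-batch negatives: for the i-th query, the i-th title is the positive
# sample and all other titles in the batch act as negatives, so the label
# of row i is simply the column index i.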
labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
loss = F.cross_entropy(input=cosine_sim, label=labels)
return loss
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from functools import partial
import numpy as np
import paddle
from data import convert_example, create_dataloader, read_text_pair
from model import SimCSE
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run the model, defaults to gpu.")
parser.add_argument("--text_pair_file", type=str, required=True, help="The full path of input file")
parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--margin", default=0.0, type=float, help="Margin between pos_sample and neg_samples.")
parser.add_argument("--scale", default=20, type=int, help="Scale for pair-wise margin_rank_loss.")
parser.add_argument("--output_emb_size", default=0, type=int, help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-base-query-encoder', type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable
def predict(model, data_loader):
"""
Computes the cosine similarity of text pairs.
Args:
model (obj:`SimCSE`): A model to extract text embedding or calculate similarity of text pair.
data_loader (obj:`List(Example)`): The processed data ids of text pair: [query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids]
Returns:
results(obj:`List`): cosine similarity of text pairs.
"""
cosine_sims = []
model.eval()
with paddle.no_grad():
for batch_data in data_loader:
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch_data
batch_cosine_sim = model.cosine_sim(
query_input_ids=query_input_ids,
title_input_ids=title_input_ids,
query_token_type_ids=query_token_type_ids,
title_token_type_ids=title_token_type_ids,
).numpy()
cosine_sims.append(batch_cosine_sim)
cosine_sims = np.concatenate(cosine_sims, axis=0)
return cosine_sims
if __name__ == "__main__":
paddle.set_device(args.device)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # query_segment
Pad(axis=0, pad_val=tokenizer.pad_token_id), # title_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # title_segment
): [data for data in fn(samples)]
valid_ds = load_dataset(read_text_pair, data_path=args.text_pair_file, lazy=False, is_test=True)
valid_data_loader = create_dataloader(
valid_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SimCSE(pretrained_model, margin=args.margin, scale=args.scale, output_emb_size=args.output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
cosine_sims = predict(model, valid_data_loader)
for idx, cosine in enumerate(cosine_sims):
print("{}".format(cosine))
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=UTF-8
import argparse
import os
from functools import partial
import paddle
from ann_util import build_index
from data import convert_example_test, create_dataloader, gen_id2corpus, gen_text_file
from model import SimCSE
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
from paddlenlp.utils.log import logger
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str, required=True, help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str, required=True, help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='recall_result', help="The full path of recall result file to save")
parser.add_argument("--recall_result_file", type=str, default='recall_result_file', help="The file name of recall result")
parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=None, type=int, help="output_embedding_size")
parser.add_argument("--recall_num", default=10, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_ef", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-base-query-encoder', type=str, help='The pretrained model used for training')
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run the model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example_test, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
): [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)
# Load pretrained semantic model
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
logger.info("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
id2corpus = gen_id2corpus(args.corpus_file)
# convert_example function's input must be a dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
# Need better way to get inner model of DataParallel
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(
query_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
if not os.path.exists(args.recall_result_dir):
os.mkdir(args.recall_result_dir)
recall_result_file = os.path.join(args.recall_result_dir, args.recall_result_file)
with open(recall_result_file, "w", encoding="utf-8") as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
recalled_idx, cosine_sims = final_index.knn_query(batch_query_embedding.numpy(), args.recall_num)
batch_size = len(cosine_sims)
for row_index in range(batch_size):
text_index = args.batch_size * batch_index + row_index
for idx, doc_idx in enumerate(recalled_idx[row_index]):
f.write(
"{}\t{}\t{}\n".format(
text_list[text_index]["text"], id2corpus[doc_idx], 1.0 - cosine_sims[row_index][idx]
)
)
python -u evaluate.py \
--similar_text_pair "recall/dev.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 50
python export_model.py --params_path checkpoints/model_12000/model_state.pdparams \
--model_name_or_path rocketqa-zh-base-query-encoder \
--output_path=./output
# gpu
root_dir="checkpoints"
python -u -m paddle.distributed.launch --gpus "3" \
predict.py \
--device gpu \
--params_path "${root_dir}/model_12000/model_state.pdparams" \
--output_emb_size 256 \
--batch_size 128 \
--max_seq_length 64 \
--model_name_or_path rocketqa-zh-base-query-encoder \
--text_pair_file "recall/test.csv"
# cpu
# root_dir="checkpoints"
# python predict.py \
# --device cpu \
# --params_path "${root_dir}/model_20000/model_state.pdparams" \
# --output_emb_size 256 \
# --batch_size 128 \
# --max_seq_length 64 \
# --text_pair_file "recall/test.csv"
# gpu
python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--params_path "checkpoints/model_12000/model_state.pdparams" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
--output_emb_size 256 \
--max_seq_length 60 \
--recall_num 50 \
--similar_text_pair "recall/dev.csv" \
--corpus_file "recall/corpus.csv"
# cpu
# python recall.py \
# --device cpu \
# --recall_result_dir "recall_result_dir" \
# --recall_result_file "recall_result.txt" \
# --params_path "checkpoints/model_20000/model_state.pdparams" \
# --hnsw_m 100 \
# --hnsw_ef 100 \
# --batch_size 64 \
# --output_emb_size 256 \
# --max_seq_length 60 \
# --recall_num 50 \
# --similar_text_pair "recall/dev.csv" \
# --corpus_file "recall/corpus.csv"
# simcse gpu
python -u -m paddle.distributed.launch --gpus '1,2,3,4' \
train.py \
--device gpu \
--save_dir ./checkpoints/ \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--save_steps 2000 \
--eval_steps 100 \
--max_seq_length 64 \
--infer_with_fc_pooler \
--dropout 0.2 \
--output_emb_size 256 \
--train_set_file "./recall/train_unsupervised.csv" \
--test_set_file "./recall/dev.csv" \
--model_name_or_path "rocketqa-zh-base-query-encoder"
# simcse cpu
# python train.py \
# --device cpu \
# --save_dir ./checkpoints/ \
# --batch_size 64 \
# --learning_rate 5E-5 \
# --epochs 3 \
# --save_steps 2000 \
# --eval_steps 100 \
# --max_seq_length 64 \
# --infer_with_fc_pooler \
# --dropout 0.2 \
# --output_emb_size 256 \
# --train_set_file "./recall/train_unsupervised.csv" \
# --test_set_file "./recall/dev.csv"
# --model_name_or_path "ernie-3.0-medium-zh"
# post training + simcse
# python -u -m paddle.distributed.launch --gpus '0,1,2,3' \
# train.py \
# --device gpu \
# --save_dir ./checkpoints/ \
# --batch_size 64 \
# --learning_rate 5E-5 \
# --epochs 3 \
# --save_steps 2000 \
# --eval_steps 100 \
# --max_seq_length 64 \
# --infer_with_fc_pooler \
# --dropout 0.2 \
# --output_emb_size 256 \
# --train_set_file "./recall/train_unsupervised.csv" \
# --test_set_file "./recall/dev.csv"
# --model_name_or_path "post_ernie"
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import time
from functools import partial
import numpy as np
import paddle
from data import convert_example, create_dataloader, read_simcse_text
from model import SimCSE
from visualdl import LogWriter
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer, LinearDecayWithWarmup
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=0, type=int, help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--epochs", default=1, type=int, help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proportion over the training process.")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--save_steps', type=int, default=10000, help="Step interval for saving checkpoint.")
parser.add_argument('--eval_steps', type=int, default=10000, help="Step interval for evaluation.")
parser.add_argument("--train_set_file", type=str, required=True, help="The full path of train_set_file.")
parser.add_argument("--test_set_file", type=str, required=True, help="The full path of test_set_file.")
parser.add_argument("--margin", default=0.0, type=float, help="Margin between pos_sample and neg_samples.")
parser.add_argument("--scale", default=20, type=int, help="Scale for pair-wise margin_rank_loss.")
parser.add_argument("--dropout", default=0.1, type=float, help="Dropout for pretrained model encoder.")
parser.add_argument("--infer_with_fc_pooler", action='store_true', help="Whether use fc layer after cls embedding or not for when infer.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-base-query-encoder', type=str, help='The pretrained model used for training')
args = parser.parse_args()
# fmt: on
def set_seed(seed):
"""sets random seed"""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
def do_train():
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
writer = LogWriter(logdir="./log/scalar_test/train")
train_ds = load_dataset(read_simcse_text, data_path=args.train_set_file, lazy=False)
pretrained_model = AutoModel.from_pretrained(
args.model_name_or_path, hidden_dropout_prob=args.dropout, attention_probs_dropout_prob=args.dropout
)
print("loading model from {}".format(args.model_name_or_path))
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # query_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # query_segment
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # title_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # title_segment
): [data for data in fn(samples)]
train_data_loader = create_dataloader(
train_ds, mode="train", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
model = SimCSE(pretrained_model, margin=args.margin, scale=args.scale, output_emb_size=args.output_emb_size)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
print("warmup from:{}".format(args.init_from_ckpt))
model = paddle.DataParallel(model)
num_training_steps = len(train_data_loader) * args.epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
)
time_start = time.time()
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
loss = model(
query_input_ids=query_input_ids,
title_input_ids=title_input_ids,
query_token_type_ids=query_token_type_ids,
title_token_type_ids=title_token_type_ids,
)
global_step += 1
if global_step % 10 == 0 and rank == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss, 10 / (time.time() - tic_train))
)
writer.add_scalar(tag="loss", step=global_step, value=loss)
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
if global_step % args.save_steps == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % (global_step))
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, "model_state.pdparams")
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
time_end = time.time()
print("totally cost", time_end - time_start)
if __name__ == "__main__":
do_train()
pymilvus>=2.1.0
pandas
paddlenlp>=2.1.1
paddlepaddle-gpu>=2.2.3
hnswlib>=0.5.2
numpy>=1.17.2
visualdl>=2.2.2
paddle-serving-app>=0.7.0
paddle-serving-client>=0.7.0
paddle-serving-server-gpu>=0.7.0.post102
pybind11
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import numpy as np
import pandas as pd
from paddle_serving_server.pipeline import PipelineClient
sys.path.append("./recall/milvus") # noqa: E402
from config import collection_name, embedding_name, partition_tag # noqa: E402
from milvus_util import RecallByMilvus # noqa: E402
def recall_result(list_data):
client = PipelineClient()
client.connect(["127.0.0.1:8080"])
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("Extract feature time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
return result
def search_in_milvus(embeddings, query_text):
recall_client = RecallByMilvus()
start_time = time.time()
results = recall_client.search(
embeddings, embedding_name, collection_name, partition_names=[partition_tag], output_fields=["pk", "text"]
)
end_time = time.time()
print("Search milvus time cost is {} seconds ".format(end_time - start_time))
list_data = []
for line in results:
for item in line:
# idx = item.id
distance = item.distance
text = item.entity.get("text")
list_data.append([query_text, text, distance])
df = pd.DataFrame(list_data, columns=["query_text", "text", "distance"])
df.to_csv("recall_result.csv", index=False)
return df
def rerank(df):
client = PipelineClient()
client.connect(["127.0.0.1:8089"])
list_data = []
for index, row in df.iterrows():
example = {"query": row["query_text"], "title": row["text"]}
list_data.append(example)
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = str(item)
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
df["distance"] = result
df = df.sort_values(by=["distance"], ascending=False)
df.to_csv("rank_result.csv", index=False)
if __name__ == "__main__":
list_data = ["中西方语言与文化的差异"]
result = recall_result(list_data)
df = search_in_milvus(result, list_data[0])
rerank(df)
# Question Answering System
A Question Answering (QA) system is an advanced form of information retrieval system that answers questions posed in natural language with accurate, concise natural-language answers. QA systems have a wide range of applications, including search engines, smart hardware such as Xiaodu speakers, chatbots, and intelligent customer service in government, finance, banking, telecom, and e-commerce.
Among QA systems, retrieval-based QA is the easiest to put into production: it is fast, controllable, and easy to extend.
A retrieval-based QA system retrieves and matches over question-answer pairs. Depending on whether FAQ (Frequently Asked Questions) data is required, it can be further divided into supervised and unsupervised retrieval-based QA: the former requires a user-provided FAQ corpus, while the latter needs no prepared QA corpus and can generate one automatically via question-answer pair generation.
PaddleNLP provides both a [supervised retrieval-based QA system](./supervised_qa) and an [unsupervised retrieval-based QA system](./unsupervised_qa); developers can choose according to their needs.
For QA application cases, see [RocketQA](https://github.com/PaddlePaddle/RocketQA).
**Supervised retrieval-based QA system demo**
<div align="center">
<img src="https://user-images.githubusercontent.com/12107462/190298926-a1fc92f3-5ec7-4265-8357-ab860cc1fed2.gif" width=800>
</div>
**Unsupervised retrieval-based QA system demo**
<div align="center">
<img src="https://user-images.githubusercontent.com/20476674/199488926-c64d3f4e-8117-475f-afe6-b02088105d09.gif">
</div>
# Intelligent Insurance QA
**Table of Contents**
* [1. Project Introduction](#项目介绍)
* [2. System Features](#系统特色)
* [3. Insurance QA System Solution](#保险问答系统方案)
* [4. Hands-on: Build Your Own End-to-End Retrieval QA System](#动手实践——搭建自己的端到端检索式问答系统)
* [5. Model Optimization](#模型优化)
* [6. References](#参考文献)
<a name="项目介绍"></a>
## 1. Project Introduction
Intelligent QA is one of the more direct and efficient ways to obtain information and knowledge. Traditional information retrieval can only find relevant documents, whereas intelligent QA locates the precise answer directly, greatly reducing the time spent searching for information. By technique, QA divides into reading-comprehension-based QA, which finds the answer span within a passage, and retrieval-based QA, which matches frequent questions and returns the associated answer to the user. This project is retrieval-based QA, which has broad applications such as search engines, smart hardware like Xiaodu speakers, intelligent customer service in government, finance, banking, telecom, and e-commerce, and chatbots.
- This solution is customized per scenario: users can train a scenario-specific model with their own data. To quickly try an FAQ intelligent QA system, see the Pipelines implementation [FAQ Intelligent QA](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/pipelines/examples/FAQ).
- For a detailed tutorial of this project (including data and code), see the [AI Studio tutorial](https://aistudio.baidu.com/aistudio/projectdetail/3882519).
<a name="系统特色"></a>
## 2. 系统特色
+ 低门槛
+ 手把手搭建检索式保险智能问答
+ 无需相似 Query-Query Pair 标注数据也能构建保险智能问答
+ 效果好
+ 业界领先的检索预训练模型: RocketQA Dual Encoder
+ 针对无标注数据场景的领先解决方案: 检索预训练模型 + 增强的无监督语义索引微调
+ 性能快
+ 基于 Paddle Inference 快速抽取向量
+ 基于 Milvus 快速查询和高性能建库
+ 基于 Paddle Serving 高性能部署
<a name="保险问答系统方案"></a>
## 3. 保险智能问答系统方案
### 3.1 技术方案和评估指标
#### 3.1.1 技术方案
**语义索引**:针对保险等金融领域的问答只有问答对的场景,我们提供了一个在SimCSE的基础上融合WR (word reptition)策略,同义词策略,R-Drop策略的无监督的解决方案。
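As an illustration under assumptions (not the exact implementation in this repo), the word repetition augmentation can be pictured as randomly repeating a small fraction of characters, where `dup_rate` mirrors the training flag of the same name:
```
import random

def word_repetition(text, dup_rate=0.1):
    # Randomly repeat a fraction of the characters so the augmented
    # "positive" is a slightly perturbed copy of the input text.
    chars = list(text)
    dup_len = max(1, int(len(chars) * dup_rate))
    dup_idx = set(random.sample(range(len(chars)), dup_len))
    out = []
    for i, ch in enumerate(chars):
        out.append(ch)
        if i in dup_idx:
            out.append(ch)
    return "".join(out)

print(word_repetition("家里有社保,还有必要买重疾险吗?"))
```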
#### 3.1.2 Evaluation Metrics
* The system is evaluated with Recall@K: the overlap ratio between the predicted top-K results (the first K entries of the score-ranked recall list) and the true top-K relevant results in the corpus. It measures the recall rate of the retrieval system; see the sketch below.
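For intuition, a minimal Recall@K computation over 0/1 relevance flags, mirroring the `recall` function in `evaluate.py`:
```
import numpy as np

def recall_at_k(rs, k):
    # rs holds one 0/1 relevance list per query, ordered by retrieval score;
    # a query counts as recalled if a relevant doc appears in its top-k.
    return np.mean([np.sum(r[:k]) for r in rs])

rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
print(recall_at_k(rs, 1))  # 0.333...: one of three queries hits within top-1
print(recall_at_k(rs, 3))  # 1.0: every query hits within top-3
```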
### 3.2 Data Description
#### 3.2.1 Built-in Dataset
The dataset comes from open-source insurance QA data on GitHub and contains user questions and the corresponding replies.
| Stage | Model | Train Set | Eval Set (for evaluating model quality) | Recall Corpus |
| ------------ | ------------ |------------ | ------------ | ------------ |
| Recall | SimCSE | 3030 | 758 | 3788 |
The question-question pairs of the train set were constructed by synonym replacement; see [nlpcda](https://github.com/425776024/nlpcda) for details.
The question pairs of the eval set were constructed by Chinese-English back-translation using the Baidu Translate API; see [Baidu Translate](https://fanyi-api.baidu.com/?fr=simultaneous) for details.
Note: the dataset was derived from open-source GitHub data; if there is any infringement, please contact us and we will remove it promptly.
```
├── data # dataset
├── train.csv # unsupervised train set
├── train_aug.csv # train set built by synonym replacement
├── test_pair.csv # test set for evaluating the model
├── corpus.csv # recall corpus for evaluating recall quality
├── qa_pair.csv # question-answer pairs (the answer for each question)
```
The dataset can be downloaded here: [faq_finance](https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/baoxianzhidao/intro.ipynb)
#### 3.2.2 Data Format
Training requires local datasets in the prescribed format: a train file `train.csv` or `train_aug.csv`, a test file `test_pair.csv`, a recall corpus `corpus.csv`, and a QA pair file `qa_pair.csv`.
The train set for unsupervised training uses the following format:
```
text 1
text 2
...
```
Sample lines from the train set `train.csv`:
```
家里有社保,还有必要买重疾险吗?
工地买了建工险,出了事故多长时间上报保险公司有效
请问下哆啦a保值不值得买呢?不晓得保障多不多
自由职业办理养老保险是否划算
工伤七级如果公司不干了,怎么赔我
普通意外险的保障范围都有哪些?
......
```
Alternatively, a data-augmentation format may be used; training then constructs sentence pairs in a supervised-like fashion. The augmented file format is:
```
text 1 \t augmented text 1
text 2 \t augmented text 2
```
Sample lines from the augmented train set `train_aug.csv`:
```
工伤七级如果公司不干了,怎么赔我 工伤七级如果企业不干了,怎生赔我
普通意外险的保障范围都有哪些? 一般性意外险的保障范围都有哪些?
重疾险赔付三次和赔付一次的区别 重疾险赔偿三次和赔偿一次的区别
...
```
The test set `test_pair.csv` consists of question pairs in the following format:
```
sentence 1 \t sentence 2
sentence 3 \t sentence 4
```
Sentence 1 and sentence 2 of each pair are similar sentences: they differ in wording or are deformed to some degree, but express the same meaning.
Sample lines from the test set:
```
车险如何计算 如何计算汽车保险
农民买养老保险怎么买 农民如何购买养老保险
车险必买哪几项 你必须购买哪些汽车保险
...
```
The recall corpus `corpus.csv` checks whether the sentence pairs of the test set can be recalled correctly. It is built by taking the second-column sentences of the test set and adding many unrelated sentences, to verify that the model can pick the correct second-column sentences out of this pool. Its format is:
```
如何办理企业养老保险
如何为西班牙购买签证保险?
康慧宝需要买多少?
如果另一方对车辆事故负有全部责任,并且拒绝提前支付维修费,该怎么办
准备清明节去新兴坡旅游,什么样的旅游保险好?
你能从国外账户购买互助基金吗?
什么是海上保险?有哪些海上保险?
....
```
The QA pair file `qa_pair.csv` contains every question of the project and its answer, in the following format:
```
question 1 \t answer 1
question 2 \t answer 2
...
```
Sample QA pairs:
```
既然强制运输保险有浮动费率制度,有商业保险吗? 商业车险也有的。关于汽车商业险的费率在全国每个省都是不一样的,在同一地区,费率也会变化。一般1年、2-4年、4-6年、费率都不同。新车第一年的费率会比较高,2-4是相对比较优惠,4-6会再上涨,不同类型的汽车费率也不同。商业车险保费浮动比例与其他公司相比都是差不多的,一般销售保费浮动比例是这样的:上年赔款1次,保费打7折;上年赔款2次,保费打8折;上年赔款3次,保费上浮15%;上年赔款4次,保费上浮51%;上年赔款5次以上,保费上浮69%。该公司的有关人士表示,如果上年赔款次数超过了7次,续保时可能会遭拒。目前的研究意见规定中加大了车险保费与赔款记录相关系数的浮动区间,并与交通违章情况挂钩,若车主少违章少出险则保费最多可打5折,反之则保费最高可上浮至现行标准的4.5倍。
汇鑫安儿童保险的保费是否也与性别有关 有关系,女宝宝会比男宝宝要多一点。如0岁男宝宝趸交是130.4元,3年期交是43.7元,5年期交是27元;而0岁女宝宝趸交是131.6元,3年期交是44.1元,5年期交是27.2元。
在中国,哪个品牌的餐饮照明比较好? 一般来说美尔家比较可靠吧,有保障
......
```
### 3.3 Code Description
```
|—— data.py # data loading and preprocessing logic
|—— model.py # SimCSE model
|—— train.py # main SimCSE training script
|—— ann_util.py # helpers for building the ANN index
|—— config.py # Milvus configuration
|—— evaluate.py # recall evaluation script
|—— recall.py # recalls texts similar to a given text from the corpus with the trained semantic-index model
|—— export_model.py # converts the dynamic graph to a static graph
|—— export_to_serving.py # converts the static graph to the Serving format
|—— feature_extract.py # batch extraction of text feature vectors
|—— milvus_util.py # Milvus insert and recall classes
|—— milvus_ann_search.py # functions for inserting vectors into the Milvus engine
|—— run_system.py # client of the client-server mode: sends texts to the server, then retrieves with the Milvus engine using the returned vectors
|—— scripts
|—— export_model.sh # bash script for dynamic-to-static conversion
|—— evaluate.sh # bash version of evaluation
|—— run_build_index.sh # bash version of index building
|—— train.sh # bash version of training
|—— feature_extract.sh # bash version of vector extraction
|—— export_to_serving.sh # bash script for converting Paddle Inference models to Serving
|—— deploy
|—— python
|—— rpc_client.py # Paddle Serving client
|—— web_service.py # Paddle Serving server
|—— config_nlp.yml # Paddle Serving configuration
```
### 3.4 Evaluation Results
The following results were obtained with the `rocketqa-zh-dureader-query-encoder` model:
| Model | Recall@1 |Recall@5 |Recall@10 |
| ------------ | ------------ |--------- |--------- |
| RocketQA + SimCSE | 82.827 | 93.791| 96.169|
| RocketQA + SimCSE + WR | 82.695 | 93.791| 96.301|
| RocketQA + SimCSE + WR + 同义词 | 85.205 | 93.923| 95.509|
| RocketQA + SimCSE + 同义词 + RDrop | **85.469** | **94.716**| **96.433**|
<a name="动手实践——搭建自己的端到端检索式问答系统"></a>
## 4. Hands-on: build your own end-to-end retrieval QA system
### 4.1 Environment setup
Before running the code below, install the dependencies:
```
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### 4.2 Model training
SimCSE can be trained in two ways, supervised and unsupervised; the difference is that unsupervised training needs no labeled data, while supervised training needs annotated question pairs. A minimal sketch of the unsupervised objective appears at the end of this subsection. The unsupervised procedure is as follows.
#### Unsupervised training
For unsupervised training, choose either `train.csv` (plain unsupervised text) or the augmented data `train_aug.csv`, then run:
```
python -u -m paddle.distributed.launch --gpus='0' \
train.py \
--device gpu \
--model_name_or_path rocketqa-zh-base-query-encoder \
--save_dir ./checkpoints/ \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--save_steps 50 \
--eval_steps 50 \
--max_seq_length 64 \
--dropout 0.2 \
--output_emb_size 256 \
--dup_rate 0.1 \
--rdrop_coef 0.1 \
--train_set_file "./data/train_aug.csv"
```
Parameter descriptions:
* `device`: whether to train on cpu or gpu
* `save_dir`: directory where model checkpoints are stored
* `model_name_or_path`: name of the pretrained language model used to initialize the model
* `batch_size`: training batch size
* `learning_rate`: training learning rate
* `epochs`: number of training epochs
* `is_unsupervised`: whether to use the unsupervised training mode
* `save_steps`: number of steps between checkpoint saves
* `max_seq_length`: maximum input sequence length
* `dropout`: dropout rate used by SimCSE
* `output_emb_size`: dimensionality of the text vector produced on top of the Transformer
* `dup_rate`: duplication rate of SimCSE's Word Repetition strategy
* `train_set_file`: training set file
* `rdrop_coef`: coefficient of the R-Drop loss
Alternatively, use the bash script:
```
sh scripts/train.sh
```
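For reference, here is a minimal sketch of the unsupervised SimCSE objective (in-batch negatives over two dropout-perturbed encodings of the same texts); the actual loss lives in model.py and may differ in detail:
```
# Minimal sketch of the unsupervised SimCSE loss, assuming query_emb and
# key_emb are two dropout-perturbed encodings of the same batch of texts.
import paddle
import paddle.nn.functional as F

def simcse_loss(query_emb, key_emb, temperature=0.05):
    # Normalize so dot products are cosine similarities
    query_emb = F.normalize(query_emb, axis=-1)
    key_emb = F.normalize(key_emb, axis=-1)
    # [batch, batch] similarity matrix; the diagonal holds the positives
    sim = paddle.matmul(query_emb, key_emb, transpose_y=True) / temperature
    labels = paddle.arange(0, sim.shape[0], dtype="int64")
    return F.cross_entropy(sim, labels)
```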
### 4.3 Evaluation
Evaluation consists of four steps:
a. Compute doc-side embeddings
Extract the text vectors of the doc corpus with the semantic-index model.
b. Build an index over the doc embeddings with hnswlib
Build the index with an ANN engine (here, [hnswlib](https://github.com/nmslib/hnswlib)).
c. Compute question embeddings and query for similar results
Extract the text vectors of the evaluation-set *Source Text* with the semantic-index model, run ANN queries against the index built in step b, recall the top-10 most similar *Target Text*, and write the recall results for each *Source Text* to the `recall_result` file. A sketch of this query step follows the step list.
d. Evaluate
Compute Recall@k (k = 1, 5, 10) from the evaluation set `test_pair.csv` and the recall results `recall_result`.
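A minimal sketch of the ANN query in step c, assuming `index` was built with hnswlib as in ann_util.py and `query_embeddings` holds the *Source Text* vectors:
```
# Minimal sketch of step c: query the hnswlib index with the Source Text
# embeddings; knn_query returns neighbour ids and inner-product distances.
labels, distances = index.knn_query(query_embeddings, k=10)
for query_id, neighbour_ids in enumerate(labels):
    # Map neighbour ids back to corpus texts to write recall_result lines
    print(query_id, neighbour_ids.tolist())
```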
Run the following command to build the ANN index, run recall, and produce the recall results `recall_result`:
```
python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--params_path "checkpoints/model_100/model_state.pdparams" \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
--output_emb_size 256 \
--max_seq_length 64 \
--recall_num 10 \
--similar_text_pair "data/test_pair.csv" \
--corpus_file "data/corpus.csv"
```
Parameter descriptions:
* `device`: whether to run on cpu or gpu
* `recall_result_dir`: directory where recall results are stored
* `recall_result_file`: file name of the recall results
* `model_name_or_path`: name of the pretrained language model used to initialize the model
* `params_path`: parameter file of the model to evaluate
* `hnsw_m`: hnsw algorithm parameter; keep the default
* `hnsw_ef`: hnsw algorithm parameter; keep the default
* `output_emb_size`: dimensionality of the text vector produced on top of the Transformer
* `recall_num`: number of similar texts recalled for each text
* `similar_text_pair`: evaluation set made up of similar text pairs
* `corpus_file`: file holding the recall corpus
Alternatively, use the bash script:
```
sh scripts/run_build_index.sh
```
`run_build_index.sh` contains both CPU and GPU variants; GPU is the default.
Next, run the following command to evaluate and produce the Recall@1, Recall@5, and Recall@10 metrics:
```
python -u evaluate.py \
--similar_text_pair "data/test_pair.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 10
```
Alternatively, use the bash script:
```
sh scripts/evaluate.sh
```
The output looks like:
```
recall@1=84.941
recall@5=94.452
recall@10=96.433
```
Parameter descriptions:
* `similar_text_pair`: evaluation set made up of similar text pairs, e.g. semantic_similar_pair.tsv
* `recall_result_file`: recall results for the first-column texts (*Source Text*) of the evaluation set
* `recall_num`: number of similar texts recalled for each text
### 4.4 Model deployment
Deployment first converts the dynamic graph to a static graph and then converts the result to the Serving format.
#### Dynamic-to-static export
First convert the dynamic-graph model to a static graph:
```
python export_model.py --params_path checkpoints/model_100/model_state.pdparams \
--output_path=./output \
--model_name_or_path rocketqa-zh-base-query-encoder
```
Alternatively, run the bash script:
```
sh scripts/export_model.sh
```
#### QA retrieval engine
With the model ready, set up the Milvus engine for fast semantic vector search. This project uses the open-source [Milvus](https://milvus.io/) tool, version 2.1; for setup, see the [official installation guide](https://milvus.io/docs/v2.1.x/install_standalone-docker.md). The official Docker-Compose installation is recommended as it is simple and quick.
Once Milvus is up, you can insert and search vectors. First generate the embeddings, one 256-dimensional vector per sample:
```
python feature_extract.py \
--model_dir=./output \
--model_name_or_path rocketqa-zh-base-query-encoder \
--corpus_file "data/corpus.csv"
```
The `output` directory holds the static-graph Paddle Inference model used for recall.
Alternatively, run the bash script:
```
sh scripts/feature_extract.sh
```
Then insert the vectors into the running Milvus instance:
```
python milvus_ann_search.py --data_path data/qa_pair.csv \
--embedding_path corpus_embedding.npy \
--batch_size 100000 \
--insert
```
Milvus also ships with a visual management UI, [Attu](https://github.com/zilliztech/attu), which makes it easy to inspect the data.
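For reference, here is a hedged sketch of searching the collection with pymilvus 2.1. The collection name `faq_finance` matches the run log in Section 4.5, but the field name `embeddings`, the host/port, and `query_vectors` are assumptions; the actual values live in config.py and milvus_util.py:
```
# Hedged sketch of a vector search with pymilvus 2.1. The collection name
# matches the run log below; field name and host/port are assumptions.
from pymilvus import connections, Collection

connections.connect(host="localhost", port="19530")
collection = Collection("faq_finance")
collection.load()
results = collection.search(
    data=query_vectors,  # assumed: list of 256-dimensional query embeddings
    anns_field="embeddings",
    param={"metric_type": "IP", "params": {"nprobe": 10}},
    limit=10,
)
```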
#### Paddle Serving deployment
For installing Paddle Serving, see the [installation guide](https://github.com/PaddlePaddle/Serving#installation). The dependencies must be installed on both the server and the client; install them with pip:
```
pip install paddle-serving-client==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install paddle-serving-app==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
# For CPU deployment, only the CPU server is needed
pip install paddle-serving-server==0.8.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
# For a GPU server, check your environment before choosing one of the following; the CUDA 10.2 package is recommended
# CUDA 10.2 + cuDNN 7 + TensorRT 6 (recommended)
pip install paddle-serving-server-gpu==0.8.3.post102 -i https://pypi.tuna.tsinghua.edu.cn/simple
# CUDA 10.1 + TensorRT 6
pip install paddle-serving-server-gpu==0.8.3.post101 -i https://pypi.tuna.tsinghua.edu.cn/simple
# CUDA 11.2 + TensorRT 8
pip install paddle-serving-server-gpu==0.8.3.post112 -i https://pypi.tuna.tsinghua.edu.cn/simple
```
For more detailed installation instructions, see [this page](https://github.com/PaddlePaddle/Serving/blob/v0.9.0/doc/Install_Linux_Env_CN.md). With the dependencies installed, export the static-graph model to the Paddle Serving format:
```
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "./serving_server" \
--client_path "./serving_client" \
--fetch_alias_names "output_embedding"
```
Parameter descriptions:
* `dirname`: path of the model files to convert; both the Program structure file and the parameter files live in this directory.
* `model_filename`: file name of the Inference Program structure of the model to convert. If set to None, `__model__` is used as the default file name.
* `params_filename`: file name of the single file holding all model parameters. It only needs to be set when all parameters are saved in one binary file; if the parameters are stored in separate files, set it to None.
* `server_path`: storage path for the converted model and configuration files. Defaults to serving_server.
* `client_path`: storage path for the converted client configuration files. Defaults to serving_client.
* `fetch_alias_names`: aliases for the model outputs; outputs such as pooled_out can be given other names. Unset by default.
* `feed_alias_names`: aliases for the model inputs; inputs such as input_ids can be given other names. Unset by default.
Alternatively, run the bash script:
```
sh scripts/export_to_serving.sh
```
Start the Pipeline Server:
```
cd deploy/python/
python web_service.py --model_name_or_path rocketqa-zh-base-query-encoder
```
Call the server from a client. Example POST request:
```
curl -X POST -k http://localhost:8090/ernie/prediction -d '{"key": ["0"], "value": ["买了社保,是不是就不用买商业保险了?"]}'
```
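The same request issued from Python; the endpoint and payload mirror the curl example above:
```
# Issue the same POST request from Python; endpoint and payload mirror
# the curl example above.
import requests

url = "http://localhost:8090/ernie/prediction"
data = {"key": ["0"], "value": ["买了社保,是不是就不用买商业保险了?"]}
print(requests.post(url, json=data).text)
```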
You can also call the server via RPC. First edit the samples to predict in rpc_client.py:
```
list_data = [
"买了社保,是不是就不用买商业保险了?",
]
```
Then run:
```
python rpc_client.py
```
Windows users should start the following Pipeline Server instead:
```
python web_service_windows.py --model_name_or_path rocketqa-zh-base-query-encoder
```
Call the server via POST (Windows does not support the RPC calling mode). First edit the samples to predict in http_client.py:
```
data = {"feed": ["买了社保,是不是就不用买商业保险了?"], "fetch": ["output_embedding"]}
```
Then run:
```
python http_client.py
```
### 4.5 Running the full QA pipeline
The QA system uses a Client-Server architecture: the vector-extraction model is deployed on the server, and a client is started to query it:
```
python run_system.py
```
The built-in test case is:
```
list_data = ["买了社保,是不是就不用买商业保险了?"]
```
The output looks like the following:
```
......
PipelineClient::predict pack_data time:1663127450.1656108
PipelineClient::predict before time:1663127450.166227
Extract feature time to cost :0.017495155334472656 seconds
=== start connecting to Milvus ===
=== Connect collection faq_finance ===
Search milvus time cost is 0.18691015243530273 seconds
如果你买社会保险,你不需要买商业保险吗? 社保是基础的,就是我们通常说的“五险”包括:基本养老保险、基本医疗保险、失业保险、工伤保险和生育保险。而商业保险则是保障。 0.32494643330574036
已有社会保险还需要买商业保险吗 社保是社会保险的简称社会保险是指国家为了预防和分担年老失业疾病以及死亡等社会风险实现社会安全而强制社会多数成员参加的具有所得重分配功能的非营利性的社会安全制度主要包括基本医疗保险基本养老保险工伤保险失业保险生育保险五大类险种,商业保险是社保的一个补充,如果有足够的经济条件可以进行购买。1、社保覆盖面广,不存在拒保问题,但是保障较低,只能满足基本的保障需求。社保中的医疗保险,住院一般可报70%。而且这70%的医疗费,限于扣除起付线标准后。而且,在社保规定用药和规定项目内。许多检查费、专家诊疗、高新尖诊疗技术,社保都是不报的。这就需配合必要的商业保险了。2、另外,社保医疗是出院后报的,商业医保中的重疾险是确诊后就可以给钱,可以弥补很多家庭没钱治的困境;3、商业保险可以选择购买更高的保额,社保则很有限;社保医疗只是补偿医药费,而没有住院期间的收入损失补偿,商业医疗就有住院补贴。总之,建议在有了社保后,再购买适合自己的寿险,加上意外险、住院医疗、重疾医疗保险,就是非常的完善的保障了。 0.38041722774505615
.....
```
The output includes the feature-extraction and retrieval times as well as the retrieved question-answer pairs.
<a name="模型优化"></a>
## 5. Model optimization
### 5.1 Supervised training [optional optimization step]
Unsupervised training only takes the model so far; to improve it further, labeled data is needed. Construct sentence pairs like those in `train_aug.csv`: only similar pairs are required, not dissimilar ones.
```
python -u -m paddle.distributed.launch --gpus='0' \
train.py \
--device gpu \
--model_name_or_path rocketqa-zh-base-query-encoder \
--save_dir ./checkpoints/ \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--save_steps 50 \
--eval_steps 50 \
--max_seq_length 64 \
--dropout 0.2 \
--output_emb_size 256 \
--dup_rate 0.1 \
--rdrop_coef 0.1 \
--train_set_file "./data/train_aug.csv"
```
The remaining steps are the same as above; only the training data is now supervised.
## 6. References
[1] Tianyu Gao, Xingcheng Yao, Danqi Chen: [SimCSE: Simple Contrastive Learning of Sentence Embeddings](https://arxiv.org/abs/2104.08821). EMNLP (1) 2021: 6894-6910