Commit 10f294ff authored by yuguo-Jack

llama_paddle

parent 7c64e6ec
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hnswlib
import numpy as np
from paddlenlp.utils.log import logger
def build_index(args, data_loader, model):
index = hnswlib.Index(space="ip", dim=args.output_emb_size if args.output_emb_size > 0 else 768)
# Initializing index
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
index.init_index(max_elements=args.hnsw_max_elements, ef_construction=args.hnsw_ef, M=args.hnsw_m)
# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
index.set_ef(args.hnsw_ef)
# Set number of threads used during batch search/construction
# By default using all available cores
index.set_num_threads(16)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index
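if __name__ == "__main__":
    # A self-contained sketch (random vectors; the sizes here are assumptions,
    # not values from this repo) of the query side of the index built above:
    # knn_query returns (labels, distances), and with the "ip" space the
    # similarity used by the recall script below is recovered as 1.0 - distance.
    dim = 256
    demo_index = hnswlib.Index(space="ip", dim=dim)
    demo_index.init_index(max_elements=1000, ef_construction=100, M=16)
    demo_index.set_ef(100)
    demo_embeddings = np.random.random((100, dim)).astype("float32")
    demo_index.add_items(demo_embeddings)
    labels, distances = demo_index.knn_query(demo_embeddings[:2], k=5)
    print(labels.shape, distances.shape)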
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
search_param = {"nprobe": 20}
collection_name = "faq_finance"
partition_tag = "partition_1"
MILVUS_HOST = "10.21.226.175"
MILVUS_PORT = 8530
data_dim = 256
top_k = 10
embedding_name = "embeddings"
index_config = {
"index_type": "IVF_FLAT",
"metric_type": "L2",
"params": {"nlist": 1000},
}
search_params = {
"metric_type": "L2",
"params": {"nprobe": top_k},
}
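# Note (general Milvus guidance, an addition rather than part of the original
# config): nlist is the number of coarse clusters IVF_FLAT builds at index time,
# and nprobe is how many of them are scanned per query, so a larger nprobe trades
# query speed for recall. Here nprobe is reused from top_k; tune it separately if
# recall at larger top_k matters.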
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
import paddle
def gen_id2corpus(corpus_file):
id2corpus = {}
with open(corpus_file, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.rstrip()
return id2corpus
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = mode == "train"
if mode == "train":
batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
def convert_example(example, tokenizer, max_seq_length=512, do_evaluate=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): The dict of texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
do_evaluate(obj:`bool`, defaults to `False`): Whether the example contains a label or not.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
if "label" in key:
# do_evaluate
result += [example["label"]]
else:
# do_train
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def convert_example_test(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): The dict of texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad each sequence to the maximum length.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def read_simcse_text(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip()
yield {"text_a": data, "text_b": data}
def read_text_pair(data_path, is_test=False):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if is_test:
if len(data) != 3:
continue
yield {"text_a": data[0], "text_b": data[1], "label": data[2]}
else:
if len(data) != 2:
continue
yield {"text_a": data[0], "text_b": data[1]}
def gen_text_file(similar_text_pair_file):
text2similar_text = {}
texts = []
with open(similar_text_pair_file, "r", encoding="utf-8") as f:
for line in f:
split_line = line.rstrip().split("\t")
if len(split_line) != 2:
continue
text, similar_text = split_line
if not text or not similar_text:
continue
text2similar_text[text] = similar_text
texts.append({"text": text})
return texts, text2similar_text
def word_repetition(input_ids, token_type_ids, dup_rate=0.32):
"""Word Repetition strategy."""
input_ids = input_ids.numpy().tolist()
token_type_ids = token_type_ids.numpy().tolist()
batch_size, seq_len = len(input_ids), len(input_ids[0])
repetitied_input_ids = []
repetitied_token_type_ids = []
rep_seq_len = seq_len
for batch_id in range(batch_size):
cur_input_id = input_ids[batch_id]
actual_len = np.count_nonzero(cur_input_id)
dup_word_index = []
# Skip word repetition when the actual sequence length is 5 or less
if actual_len > 5:
dup_len = random.randint(a=0, b=max(2, int(dup_rate * actual_len)))
# Skip cls and sep position
dup_word_index = random.sample(list(range(1, actual_len - 1)), k=dup_len)
r_input_id = []
r_token_type_id = []
for idx, word_id in enumerate(cur_input_id):
# Insert duplicate word
if idx in dup_word_index:
r_input_id.append(word_id)
r_token_type_id.append(token_type_ids[batch_id][idx])
r_input_id.append(word_id)
r_token_type_id.append(token_type_ids[batch_id][idx])
after_dup_len = len(r_input_id)
repetitied_input_ids.append(r_input_id)
repetitied_token_type_ids.append(r_token_type_id)
if after_dup_len > rep_seq_len:
rep_seq_len = after_dup_len
# Padding the data to the same length
for batch_id in range(batch_size):
after_dup_len = len(repetitied_input_ids[batch_id])
pad_len = rep_seq_len - after_dup_len
repetitied_input_ids[batch_id] += [0] * pad_len
repetitied_token_type_ids[batch_id] += [0] * pad_len
return paddle.to_tensor(repetitied_input_ids, dtype="int64"), paddle.to_tensor(
repetitied_token_type_ids, dtype="int64"
)
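if __name__ == "__main__":
    # A minimal, self-contained sketch (toy ids, not from a real tokenizer) of
    # the Word Repetition augmentation above: roughly dup_rate of the
    # non-special tokens are duplicated in place, then the batch is re-padded
    # to a common length.
    toy_input_ids = paddle.to_tensor([[101, 5, 6, 7, 8, 9, 102]], dtype="int64")
    toy_token_type_ids = paddle.zeros_like(toy_input_ids)
    dup_ids, dup_type_ids = word_repetition(toy_input_ids, toy_token_type_ids, dup_rate=0.32)
    print(dup_ids.numpy(), dup_type_ids.numpy())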
# worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own gRPC server and DAG
# When build_dag_each_worker=False, the framework sets the main thread gRPC thread pool's max_workers to worker_num
worker_num: 20
# build_dag_each_worker: False builds a single DAG inside the process; True builds an independent DAG in each worker process
build_dag_each_worker: false
dag:
# op resource type: True for the thread model, False for the process model
is_thread_op: False
# Profiling: True generates Timeline performance data (with some performance overhead); False disables it
tracer:
interval_s: 10
# HTTP port; rpc_port and http_port must not both be empty. When rpc_port is available and http_port is empty, http_port is not generated automatically
http_port: 8090
# RPC port; rpc_port and http_port must not both be empty. When rpc_port is empty and http_port is not, rpc_port is automatically set to http_port + 1
rpc_port: 8080
op:
ernie:
# Concurrency: thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
concurrency: 1
# When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf:
# Client type: brpc, grpc, or local_predictor. local_predictor does not start a Serving service; prediction runs in-process
client_type: local_predictor
# ir_optim
ir_optim: True
# device_type: 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 1
# Compute device IDs: "" or unset means CPU prediction; "0" or "0,1,2" means GPU prediction on the listed cards
devices: '2'
# Fetch list, keyed by the alias_name of fetch_var in client_config; if unset, all outputs are returned
fetch_list: ['output_embedding']
# Model path
model_config: ../../serving_server/
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import requests
headers = {"Content-type": "application/json"}
url = "http://10.21.226.175:8080/ernie/prediction" # XXX取决于服务端YourService的初始化name参数
data = {"feed": ["买了社保,是不是就不用买商业保险了?"], "fetch": ["output_embedding"]}
data = json.dumps(data)
print(data)
r = requests.post(url=url, headers=headers, data=data)
print(r.json())
json_data = r.json()
data = np.array(json_data["result"]["output_embedding"])
print(data.shape)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_server.pipeline import PipelineClient
client = PipelineClient()
client.connect(["127.0.0.1:8080"])
list_data = ["买了社保,是不是就不用买商业保险了?"]
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
print(feed)
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
print(ret.key)
print(result.shape)
print(result)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from paddle_serving_server.web_service import Op, WebService
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The tokenizer name for the model")
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
result = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class ErnieOp(Op):
def init_op(self):
from paddlenlp.transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
def preprocess(self, input_dicts, data_id, log_id):
from paddlenlp.data import Pad, Tuple
((_, input_dict),) = input_dicts.items()
print("input dict", input_dict)
batch_size = len(input_dict.keys())
examples = []
for i in range(batch_size):
input_ids, segment_ids = convert_example([input_dict[str(i)]], self.tokenizer)
examples.append((input_ids, segment_ids))
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"), # segment
),
):
return fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict["input_ids"] = input_ids
feed_dict["token_type_ids"] = segment_ids
return feed_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
new_dict = {}
new_dict["output_embedding"] = str(fetch_dict["output_embedding"].tolist())
return new_dict, None, ""
class ErnieService(WebService):
def get_pipeline_response(self, read_op):
ernie_op = ErnieOp(name="ernie", input_ops=[read_op])
return ernie_op
if __name__ == "__main__":
ernie_service = ErnieService(name="ernie")
ernie_service.prepare_pipeline_config("config_nlp.yml")
ernie_service.run_service()
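# A launch sketch (the script name is an assumption; the config file name comes
# from the prepare_pipeline_config call above):
#   python web_service.py --model_name_or_path rocketqa-zh-base-query-encoder
# config_nlp.yml above sets http_port: 8090 and rpc_port: 8080; the
# PipelineClient snippet earlier connects to the latter.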
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from paddle_serving_server.web_service import WebService
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The tokenizer name for the model")
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
result = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class ErnieService(WebService):
def init_service(self):
from paddlenlp.transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
def preprocess(self, feed=[], fetch=[]):
from paddlenlp.data import Pad, Tuple
print("input dict", feed)
batch_size = len(feed)
is_batch = True
examples = []
for i in range(batch_size):
input_ids, segment_ids = convert_example([feed[i]], self.tokenizer)
examples.append((input_ids, segment_ids))
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"), # segment
),
):
return fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict["input_ids"] = input_ids
feed_dict["token_type_ids"] = segment_ids
return feed_dict, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map=None):
for key in fetch_map:
fetch_map[key] = fetch_map[key].tolist()
return fetch_map
if __name__ == "__main__":
ernie_service = ErnieService(name="ernie")
ernie_service.load_model_config("../../serving_server")
ernie_service.prepare_server(workdir="workdir", port=8080)
ernie_service.init_service()
ernie_service.run_debugger_service()
ernie_service.run_web_service()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--similar_text_pair", type=str, default='', help="The full path of similat pair file")
parser.add_argument("--recall_result_file", type=str, default='', help="The full path of recall result file")
parser.add_argument("--recall_num", type=int, default=10, help="Most similair number of doc recalled from corpus per query")
args = parser.parse_args()
# yapf: enable
def recall(rs, N=10):
"""
Ratio of recalled Ground Truth at topN Recalled Docs
>>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
>>> recall(rs, N=1)
0.333333
>>> recall(rs, N=2)
0.6666667
>>> recall(rs, N=3)
1.0
Args:
rs: Iterator of recalled flags
Returns:
Recall@N
"""
recall_flags = [np.sum(r[0:N]) for r in rs]
return np.mean(recall_flags)
if __name__ == "__main__":
text2similar = {}
with open(args.similar_text_pair, "r", encoding="utf-8") as f:
for line in f:
text, similar_text = line.rstrip().split("\t")
text2similar[text] = similar_text
rs = []
with open(args.recall_result_file, "r", encoding="utf-8") as f:
relevance_labels = []
for index, line in enumerate(f):
if index % args.recall_num == 0 and index != 0:
rs.append(relevance_labels)
relevance_labels = []
text, recalled_text, cosine_sim = line.rstrip().split("\t")
if text2similar[text] == recalled_text:
relevance_labels.append(1)
else:
relevance_labels.append(0)
recall_N = []
recall_num = [1, 5, 10]
res = []
for topN in recall_num:
R = round(100 * recall(rs, N=topN), 3)
recall_N.append(str(R))
for key, val in zip(recall_num, recall_N):
print("recall@{}={}".format(key, val))
res.append(str(val))
with open("result.tsv", "a", encoding="utf-8") as result:
result.write("\t".join(res) + "\n")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SimCSE
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
default='./checkpoint/model_50/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output',
help="The path of model parameter in static graph to be saved.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--output_emb_size", default=256, type=int, help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # segment_ids
],
)
# Save in static graph model.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle_serving_client.io as serving_io
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--dirname", type=str, required=True,
default='./output', help="Path of saved model files. Program file and parameter files are saved in this directory.")
parser.add_argument("--model_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdmodel', help="The name of file to load the inference program. If it is None, the default filename __model__ will be used.")
parser.add_argument("--params_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdiparams', help="The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.")
parser.add_argument("--server_path", type=str, default='./serving_server',
help="The path of server parameter in static graph to be saved.")
parser.add_argument("--client_path", type=str, default='./serving_client',
help="The path of client parameter in static graph to be saved.")
parser.add_argument("--feed_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of feed vars')
parser.add_argument("--fetch_alias_names", type=str, default=None,
help='set alias names for fetch vars, split by comma \',\', you should run --show_proto to check the number of fetch vars')
parser.add_argument("--show_proto", type=bool, default=False,
help='If yes, you can preview the proto and then determine your feed var alias name and fetch var alias name.')
# yapf: enable
if __name__ == "__main__":
args = parser.parse_args()
serving_io.inference_model_to_serving(
dirname=args.dirname,
serving_server=args.server_path,
serving_client=args.client_path,
model_filename=args.model_filename,
params_filename=args.params_filename,
show_proto=args.show_proto,
feed_alias_names=args.feed_alias_names,
fetch_alias_names=args.fetch_alias_names,
)
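# A usage sketch (the script name is an assumption; the file names follow the
# export_model.py output above):
#   python export_to_serving.py \
#       --dirname ./output \
#       --model_filename inference.get_pooled_embedding.pdmodel \
#       --params_filename inference.get_pooled_embedding.pdiparams \
#       --server_path ./serving_server \
#       --client_path ./serving_client \
#       --fetch_alias_names output_embedding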
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import numpy as np
import paddle
from paddle import inference
from tqdm import tqdm
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory to static model.")
parser.add_argument("--corpus_file", type=str, required=True, help="The corpus_file path.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): The dict of texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad each sequence to the maximum length.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class Predictor(object):
def __init__(
self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False,
):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.get_pooled_embedding.pdmodel"
params_file = model_dir + "/inference.get_pooled_embedding.pdiparams"
if not os.path.exists(model_file):
raise ValueError("Model file not found: {}".format(model_file))
if not os.path.exists(params_file):
raise ValueError("Params file not found: {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initialize the gpu memory, enable tensorrt
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size, min_subgraph_size=30, precision_mode=precision_mode
)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
def predict(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data whose each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the predictions labels.
"""
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # segment
),
):
return fn(samples)
all_embeddings = []
examples = []
for idx, text in enumerate(tqdm(data)):
input_ids, segment_ids = convert_example(
text, tokenizer, max_seq_length=self.max_seq_length, pad_to_max_seq_len=True
)
examples.append((input_ids, segment_ids))
if len(examples) >= self.batch_size:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
examples = []
if len(examples) > 0:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
all_embeddings = np.concatenate(all_embeddings, axis=0)
np.save("corpus_embedding", all_embeddings)
def read_text(file_path):
id2corpus = {}
with open(file_path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.strip()
return id2corpus
if __name__ == "__main__":
predictor = Predictor(
args.model_dir,
args.device,
args.max_seq_length,
args.batch_size,
args.use_tensorrt,
args.precision,
args.cpu_threads,
args.enable_mkldnn,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
id2corpus = read_text(args.corpus_file)
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
predictor.predict(corpus_list, tokenizer)
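# A usage sketch (the script name feature_extract.py is an assumption):
#   python feature_extract.py --model_dir ./output \
#       --corpus_file data/corpus.csv \
#       --model_name_or_path rocketqa-zh-base-query-encoder
# The embeddings are saved to corpus_embedding.npy, which the Milvus insert
# script below consumes.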
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import numpy as np
from config import collection_name, embedding_name, partition_tag
from milvus_util import RecallByMilvus, VecToMilvus, text_max_len
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_path", default="data/corpus.csv", type=str, required=True, help="The data for vector extraction."
)
parser.add_argument(
"--embedding_path", default="corpus_embedding.npy", type=str, required=True, help="The vector path for data."
)
parser.add_argument("--index", default=0, type=int, help="index of the vector for search")
parser.add_argument("--insert", action="store_true", help="whether to insert data")
parser.add_argument("--search", action="store_true", help="whether to search data")
parser.add_argument("--batch_size", default=100000, type=int, help="number of examples to insert each time")
args = parser.parse_args()
def read_text(file_path):
id2corpus = []
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
question, answer = line.strip().split("\t")
id2corpus.append({"question": question, "answer": answer})
return id2corpus
def milvus_data_insert(data_path, embedding_path, batch_size):
corpus_list = read_text(data_path)
embeddings = np.load(embedding_path)
embedding_ids = [i for i in range(embeddings.shape[0])]
client = VecToMilvus()
client.drop_collection(collection_name)
data_size = len(embedding_ids)
for i in tqdm(range(0, data_size, batch_size)):
cur_end = i + batch_size
if cur_end > data_size:
cur_end = data_size
batch_emb = embeddings[np.arange(i, cur_end)]
entities = [
[j for j in range(i, cur_end, 1)],
[corpus_list[j]["question"][: text_max_len - 1] for j in range(i, cur_end, 1)],
[corpus_list[j]["answer"][: text_max_len - 1] for j in range(i, cur_end, 1)],
batch_emb, # field embeddings, supports numpy.ndarray and list
]
client.insert(
collection_name=collection_name, entities=entities, index_name=embedding_name, partition_tag=partition_tag
)
def milvus_data_recall(embedding_path, index):
embeddings = np.load(embedding_path)
embedding_ids = [i for i in range(embeddings.shape[0])]
recall_client = RecallByMilvus()
if index >= len(embedding_ids):
print("Index should not be larger than embedding size")
return
embeddings = embeddings[np.arange(index, index + 1)]
time_start = time.time()
result = recall_client.search(
embeddings, embedding_name, collection_name, partition_names=[partition_tag], output_fields=["pk", "question", "answer"]
)
time_end = time.time()
sum_t = time_end - time_start
print("time cost", sum_t, "s")
for hits in result:
for hit in hits:
print(f"hit: {hit}, text field: {hit.entity.get('text')}")
if __name__ == "__main__":
if args.insert:
milvus_data_insert(args.data_path, args.embedding_path, args.batch_size)
if args.search:
milvus_data_recall(args.embedding_path, args.index)
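# A usage sketch (the script name milvus_ann_search.py is an assumption):
#   python milvus_ann_search.py --data_path data/corpus.csv \
#       --embedding_path corpus_embedding.npy --batch_size 100000 --insert
#   python milvus_ann_search.py --data_path data/corpus.csv \
#       --embedding_path corpus_embedding.npy --index 18 --search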
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from config import (
MILVUS_HOST,
MILVUS_PORT,
data_dim,
index_config,
search_params,
top_k,
)
from pymilvus import (
Collection,
CollectionSchema,
DataType,
FieldSchema,
connections,
utility,
)
fmt = "\n=== {:30} ===\n"
text_max_len = 1000
fields = [
FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="question", dtype=DataType.VARCHAR, max_length=text_max_len),
FieldSchema(name="answer", dtype=DataType.VARCHAR, max_length=text_max_len),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=data_dim),
]
schema = CollectionSchema(fields, "Neural Search Index")
class VecToMilvus:
def __init__(self):
print(fmt.format("start connecting to Milvus"))
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
self.collection = None
def has_collection(self, collection_name):
try:
has = utility.has_collection(collection_name)
print(f"Does collection {collection_name} exist in Milvus: {has}")
return has
except Exception as e:
print("Milvus has_table error:", e)
def create_collection(self, collection_name):
try:
print(fmt.format("Create collection {}".format(collection_name)))
self.collection = Collection(collection_name, schema, consistency_level="Strong")
except Exception as e:
print("Milvus create collection error:", e)
def drop_collection(self, collection_name):
try:
utility.drop_collection(collection_name)
except Exception as e:
print("Milvus delete collection error:", e)
def create_index(self, index_name):
try:
print(fmt.format("Start Creating index"))
self.collection.create_index(index_name, index_config)
print(fmt.format("Start loading"))
self.collection.load()
except Exception as e:
print("Milvus create index error:", e)
def has_partition(self, partition_tag):
try:
result = self.collection.has_partition(partition_tag)
return result
except Exception as e:
print("Milvus has partition error: ", e)
def create_partition(self, partition_tag):
try:
self.collection.create_partition(partition_tag)
print("create partition {} successfully".format(partition_tag))
except Exception as e:
print("Milvus create partition error: ", e)
def insert(self, entities, collection_name, index_name, partition_tag=None):
try:
if not self.has_collection(collection_name):
self.create_collection(collection_name)
self.create_index(index_name)
else:
self.collection = Collection(collection_name)
if (partition_tag is not None) and (not self.has_partition(partition_tag)):
self.create_partition(partition_tag)
self.collection.insert(entities, partition_name=partition_tag)
print(f"Number of entities in Milvus: {self.collection.num_entities}") # check the num_entites
except Exception as e:
print("Milvus insert error:", e)
class RecallByMilvus:
def __init__(self):
print(fmt.format("start connecting to Milvus"))
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
self.collection = None
def get_collection(self, collection_name):
try:
print(fmt.format("Connect collection {}".format(collection_name)))
self.collection = Collection(collection_name)
except Exception as e:
print("Milvus create collection error:", e)
def search(self, vectors, embedding_name, collection_name, partition_names=[], output_fields=[]):
try:
self.get_collection(collection_name)
result = self.collection.search(
vectors,
embedding_name,
search_params,
limit=top_k,
partition_names=partition_names,
output_fields=output_fields,
)
return result
except Exception as e:
print("Milvus recall error: ", e)
if __name__ == "__main__":
print(fmt.format("Start inserting entities"))
rng = np.random.default_rng(seed=19530)
num_entities = 3000
entities = [
# provide the pk field because `auto_id` is set to False
[i for i in range(num_entities)],
["第{}个问题".format(i) for i in range(num_entities)], # field question, only supports list
["第{}个答案".format(i) for i in range(num_entities)], # field answer, only supports list
rng.random((num_entities, data_dim)), # field embeddings, supports numpy.ndarray and list
]
print(entities[-1].shape)
collection_name = "test1"
partition_tag = "partition_1"
embedding_name = "embeddings"
client = VecToMilvus()
client.insert(
collection_name=collection_name, entities=entities, index_name=embedding_name, partition_tag=partition_tag
)
print(fmt.format("Start searching entities"))
vectors_to_search = entities[-1][-2:]
recall_client = RecallByMilvus()
result = recall_client.search(
vectors_to_search,
embedding_name,
collection_name,
partition_names=[partition_tag],
output_fields=["pk", "text"],
)
for hits in result:
for hit in hits:
print(f"hit: {hit}, random field: {hit.entity.get('text')}")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp
class SimCSE(nn.Layer):
def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is greater than 0, then add Linear layer to reduce embedding_size,
# we recommend set output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
self.output_emb_size = output_emb_size if output_emb_size is not None else 0
if self.output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(768, self.output_emb_size, weight_attr=weight_attr)
self.margin = margin
# Scale cosine similarity to ease training convergence
self.scale = scale
self.classifier = nn.Linear(self.output_emb_size if self.output_emb_size > 0 else 768, 2)
self.rdrop_loss = paddlenlp.losses.RDropLoss()
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def get_pooled_embedding(
self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, with_pooler=True
):
# Note: cls_embedding is the pooled embedding with tanh activation
sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if with_pooler is False:
cls_embedding = sequence_output[:, 0, :]
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
with_pooler=True,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask, with_pooler=with_pooler
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask, with_pooler=with_pooler
)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
def forward(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
logits1 = self.classifier(query_cls_embedding)
logits2 = self.classifier(title_cls_embedding)
kl_loss = self.rdrop_loss(logits1, logits2)
cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)
# Subtract margin from the cosine similarity of all positive pairs (the diagonal)
margin_diag = paddle.full(
shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()
)
cosine_sim = cosine_sim - paddle.diag(margin_diag)
# Scale cosine similarity to ease training convergence
cosine_sim *= self.scale
labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
loss = F.cross_entropy(input=cosine_sim, label=labels)
return loss, kl_loss
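# A scoring sketch (assumption: `tokenizer` and `model` are built as in the
# recall script below, and the ids are padded int64 tensors):
#   sim = model.cosine_sim(
#       query_input_ids, title_input_ids,
#       query_token_type_ids=query_token_type_ids,
#       title_token_type_ids=title_token_type_ids,
#   )
# cosine_sim returns one similarity per pair, while forward computes the
# in-batch negative loss (plus the R-Drop KL loss) used for training.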
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from functools import partial
import paddle
from ann_util import build_index
from data import convert_example_test, create_dataloader, gen_id2corpus, gen_text_file
from model import SimCSE
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
from paddlenlp.utils.log import logger
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str, required=True, help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str, required=True, help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='recall_result', help="The full path of recall result file to save")
parser.add_argument("--recall_result_file", type=str, default='recall_result_file', help="The file name of recall result")
parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=None, type=int, help="output_embedding_size")
parser.add_argument("--recall_num", default=10, type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_ef", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example_test, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
),
):
return [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)
# Load pretrained semantic model
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
logger.info("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
id2corpus = gen_id2corpus(args.corpus_file)
# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
# Need better way to get inner model of DataParallel
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(
query_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
if not os.path.exists(args.recall_result_dir):
os.mkdir(args.recall_result_dir)
recall_result_file = os.path.join(args.recall_result_dir, args.recall_result_file)
with open(recall_result_file, "w", encoding="utf-8") as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
recalled_idx, cosine_sims = final_index.knn_query(batch_query_embedding.numpy(), args.recall_num)
batch_size = len(cosine_sims)
for row_index in range(batch_size):
text_index = args.batch_size * batch_index + row_index
for idx, doc_idx in enumerate(recalled_idx[row_index]):
f.write(
"{}\t{}\t{}\n".format(
text_list[text_index]["text"], id2corpus[doc_idx], 1.0 - cosine_sims[row_index][idx]
)
)
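# A run sketch (the script name recall.py is an assumption; the data and
# checkpoint paths follow the run scripts below):
#   python -u recall.py \
#       --corpus_file data/corpus.csv \
#       --similar_text_pair_file data/test_pair.csv \
#       --params_path checkpoints/model_100/model_state.pdparams \
#       --output_emb_size 256 --recall_num 10
# Each query then gets recall_num lines of "query \t recalled_doc \t similarity"
# in the recall result file, which evaluate.py consumes.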
pymilvus>=2.1.0
pandas==0.25.1
paddlenlp>=2.3.7
paddlepaddle-gpu>=2.2.3
hnswlib>=0.5.2
numpy>=1.17.2
visualdl>=2.2.2
pybind11
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
import pandas as pd
from config import collection_name, embedding_name, partition_tag
from milvus_util import RecallByMilvus
from paddle_serving_server.pipeline import PipelineClient
def recall_result(list_data):
client = PipelineClient()
client.connect(["127.0.0.1:8080"])
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("Extract feature time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
return result
def search_in_milvus(embeddings, query_text):
recall_client = RecallByMilvus()
start_time = time.time()
results = recall_client.search(
embeddings,
embedding_name,
collection_name,
partition_names=[partition_tag],
output_fields=["pk", "question", "answer"],
)
end_time = time.time()
print("Search milvus time cost is {} seconds ".format(end_time - start_time))
list_data = []
for line in results:
for item in line:
distance = item.distance
question = item.entity.get("question")
answer = item.entity.get("answer")
print(question, answer, distance)
list_data.append([query_text, question, answer, distance])
df = pd.DataFrame(list_data, columns=["query_text", "question", "answer", "distance"])
df.to_csv("faq_result.csv", index=False)
if __name__ == "__main__":
list_data = ["买了社保,是不是就不用买商业保险了?"]
result = recall_result(list_data)
search_in_milvus(result, list_data[0])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python -u evaluate.py \
--similar_text_pair "data/test_pair.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 10
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python export_model.py --params_path checkpoints/model_100/model_state.pdparams \
--output_path=./output \
--model_name_or_path rocketqa-zh-base-query-encoder